linux/net/core/dev.c
<<
>>
Prefs
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <linux/rtnetlink.h>
 100#include <linux/stat.h>
 101#include <net/dst.h>
 102#include <net/pkt_sched.h>
 103#include <net/checksum.h>
 104#include <net/xfrm.h>
 105#include <linux/highmem.h>
 106#include <linux/init.h>
 107#include <linux/module.h>
 108#include <linux/netpoll.h>
 109#include <linux/rcupdate.h>
 110#include <linux/delay.h>
 111#include <net/iw_handler.h>
 112#include <asm/current.h>
 113#include <linux/audit.h>
 114#include <linux/dmaengine.h>
 115#include <linux/err.h>
 116#include <linux/ctype.h>
 117#include <linux/if_arp.h>
 118#include <linux/if_vlan.h>
 119#include <linux/ip.h>
 120#include <net/ip.h>
 121#include <net/mpls.h>
 122#include <linux/ipv6.h>
 123#include <linux/in.h>
 124#include <linux/jhash.h>
 125#include <linux/random.h>
 126#include <trace/events/napi.h>
 127#include <trace/events/net.h>
 128#include <trace/events/skb.h>
 129#include <linux/pci.h>
 130#include <linux/inetdevice.h>
 131#include <linux/cpu_rmap.h>
 132#include <linux/static_key.h>
 133#include <linux/hashtable.h>
 134#include <linux/vmalloc.h>
 135#include <linux/if_macvlan.h>
 136#include <linux/errqueue.h>
 137#include <linux/hrtimer.h>
 138
 139#include "net-sysfs.h"
 140
 141/* Instead of increasing this, you should create a hash table. */
 142#define MAX_GRO_SKBS 8
 143
 144/* This should be increased if a protocol with a bigger head is added. */
 145#define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
    /*
     * ptype_lock is taken around every insertion into / removal from
     * ptype_base[] and ptype_all (dev_add_pack(), __dev_remove_pack());
     * offload_lock plays the same role for offload_base
     * (dev_add_offload(), __dev_remove_offload()).
     */
 147static DEFINE_SPINLOCK(ptype_lock);
 148static DEFINE_SPINLOCK(offload_lock);
    /* Handlers bucketed by protocol type; the bucket is chosen by ptype_head(). */
 149struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 150struct list_head ptype_all __read_mostly;       /* Taps: handlers registered for ETH_P_ALL */
    /* Single list holding every registered struct packet_offload. */
 151static struct list_head offload_base __read_mostly;
 152
    /* Forward declarations; the definitions are not in this part of the file. */
 153static int netif_rx_internal(struct sk_buff *skb);
 154static int call_netdevice_notifiers_info(unsigned long val,
 155                                         struct net_device *dev,
 156                                         struct netdev_notifier_info *info);
 157
 158/*
 159 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 160 * semaphore.
 161 *
 162 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 163 *
 164 * Writers must hold the rtnl semaphore while they loop through the
 165 * dev_base_head list, and hold dev_base_lock for writing when they do the
 166 * actual updates.  This allows pure readers to access the list even
 167 * while a writer is preparing to update it.
 168 *
 169 * To put it another way, dev_base_lock is held for writing only to
 170 * protect against pure readers; the rtnl semaphore provides the
 171 * protection against other writers.
 172 *
 173 * See, for example usages, register_netdevice() and
 174 * unregister_netdevice(), which must be called with the rtnl
 175 * semaphore held.
 176 */
 177DEFINE_RWLOCK(dev_base_lock);
 178EXPORT_SYMBOL(dev_base_lock);
 179
 180/* protects napi_hash addition/deletion and napi_gen_id */
 181static DEFINE_SPINLOCK(napi_hash_lock);
 182
 183static unsigned int napi_gen_id;
    /* NOTE(review): the second argument is presumably the number of hash bits - confirm against linux/hashtable.h */
 184static DEFINE_HASHTABLE(napi_hash, 8);
 185
    /*
     * Sequence counter sampled by netdev_get_name(): the name copy is
     * retried when the counter changed while dev->name was being read.
     */
 186static seqcount_t devnet_rename_seq;
 187
 188static inline void dev_base_seq_inc(struct net *net)
 189{
 190        while (++net->dev_base_seq == 0);
 191}
 192
 193static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194{
 195        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198}
 199
 200static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201{
 202        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203}
 204
 205static inline void rps_lock(struct softnet_data *sd)
 206{
 207#ifdef CONFIG_RPS
 208        spin_lock(&sd->input_pkt_queue.lock);
 209#endif
 210}
 211
 212static inline void rps_unlock(struct softnet_data *sd)
 213{
 214#ifdef CONFIG_RPS
 215        spin_unlock(&sd->input_pkt_queue.lock);
 216#endif
 217}
 218
 219/* Device list insertion */
 220static void list_netdevice(struct net_device *dev)
 221{
 222        struct net *net = dev_net(dev);
 223
 224        ASSERT_RTNL();
 225
 226        write_lock_bh(&dev_base_lock);
 227        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229        hlist_add_head_rcu(&dev->index_hlist,
 230                           dev_index_hash(net, dev->ifindex));
 231        write_unlock_bh(&dev_base_lock);
 232
 233        dev_base_seq_inc(net);
 234}
 235
 236/* Device list removal
 237 * caller must respect a RCU grace period before freeing/reusing dev
 238 */
 239static void unlist_netdevice(struct net_device *dev)
 240{
 241        ASSERT_RTNL();
 242
 243        /* Unlink dev from the device chain */
 244        write_lock_bh(&dev_base_lock);
 245        list_del_rcu(&dev->dev_list);
 246        hlist_del_rcu(&dev->name_hlist);
 247        hlist_del_rcu(&dev->index_hlist);
 248        write_unlock_bh(&dev_base_lock);
 249
 250        dev_base_seq_inc(dev_net(dev));
 251}
 252
 253/*
 254 *      Our notifier list
     *
     *      NOTE(review): declared as a raw notifier head; a raw chain
     *      presumably does no locking of its own, so callers are expected
     *      to serialise (RTNL?) - confirm against linux/notifier.h.
 255 */
 256
 257static RAW_NOTIFIER_HEAD(netdev_chain);
 258
 259/*
 260 *      Device drivers call our routines to queue packets here. We empty the
 261 *      queue in the local softnet handler.
     *
     *      Declared per CPU; rps_lock()/rps_unlock() in this file lock the
     *      input_pkt_queue member of one such instance.
 262 */
 263
 264DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 265EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267#ifdef CONFIG_LOCKDEP
 268/*
 269 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 270 * according to dev->type
     *
     * netdev_lock_type[] lists the device types that get a lock class of
     * their own.  netdev_lock_pos() searches it and falls back to the
     * last entry (ARPHRD_NONE) for any type that is not listed.
 271 */
 272static const unsigned short netdev_lock_type[] =
 273        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 274         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 275         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 276         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 277         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 278         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 279         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 280         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 281         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 282         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 283         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 284         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 285         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 286         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 287         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 288
    /*
     * Lock class names.  Indexed with the position returned by
     * netdev_lock_pos(), so this table must stay entry-for-entry aligned
     * with netdev_lock_type[] above.
     */
 289static const char *const netdev_lock_name[] =
 290        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 291         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 292         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 293         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 294         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 295         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 296         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 297         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 298         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 299         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 300         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 301         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 302         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 303         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 304         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 305
    /*
     * One lock class key per table entry: the first array is used for the
     * xmit lock, the second for dev->addr_list_lock.
     */
 306static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 307static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310{
 311        int i;
 312
 313        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                if (netdev_lock_type[i] == dev_type)
 315                        return i;
 316        /* the last key is used by default */
 317        return ARRAY_SIZE(netdev_lock_type) - 1;
 318}
 319
 320static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                 unsigned short dev_type)
 322{
 323        int i;
 324
 325        i = netdev_lock_pos(dev_type);
 326        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                   netdev_lock_name[i]);
 328}
 329
 330static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331{
 332        int i;
 333
 334        i = netdev_lock_pos(dev->type);
 335        lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                   &netdev_addr_lock_key[i],
 337                                   netdev_lock_name[i]);
 338}
 339#else
 340static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                 unsigned short dev_type)
 342{
 343}
 344static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345{
 346}
 347#endif
 348
 349/*******************************************************************************
 350
 351                Protocol management and registration routines
 352
 353*******************************************************************************/
 354
 355/*
 356 *      Add a protocol ID to the list. Now that the input handler is
 357 *      smarter we can dispense with all the messy stuff that used to be
 358 *      here.
 359 *
 360 *      BEWARE!!! Protocol handlers, mangling input packets,
 361 *      MUST BE last in hash buckets and checking protocol handlers
 362 *      MUST start from promiscuous ptype_all chain in net_bh.
 363 *      It is true now, do not change it.
 364 *      Explanation follows: if protocol handler, mangling packet, will
 365 *      be the first on list, it is not able to sense, that packet
 366 *      is cloned and should be copied-on-write, so that it will
 367 *      change it and subsequent readers will get broken packet.
 368 *                                                      --ANK (980803)
 369 */
 370
 371static inline struct list_head *ptype_head(const struct packet_type *pt)
 372{
 373        if (pt->type == htons(ETH_P_ALL))
 374                return &ptype_all;
 375        else
 376                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 377}
 378
 379/**
 380 *      dev_add_pack - add packet handler
 381 *      @pt: packet type declaration
 382 *
 383 *      Add a protocol handler to the networking stack. The passed &packet_type
 384 *      is linked into kernel lists and may not be freed until it has been
 385 *      removed from the kernel lists.
 386 *
 387 *      This call does not sleep therefore it can not
 388 *      guarantee all CPU's that are in middle of receiving packets
 389 *      will see the new packet type (until the next received packet).
 390 */
 391
 392void dev_add_pack(struct packet_type *pt)
 393{
 394        struct list_head *head = ptype_head(pt);
 395
 396        spin_lock(&ptype_lock);
 397        list_add_rcu(&pt->list, head);
 398        spin_unlock(&ptype_lock);
 399}
 400EXPORT_SYMBOL(dev_add_pack);
 401
 402/**
 403 *      __dev_remove_pack        - remove packet handler
 404 *      @pt: packet type declaration
 405 *
 406 *      Remove a protocol handler that was previously added to the kernel
 407 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 408 *      from the kernel lists and can be freed or reused once this function
 409 *      returns.
 410 *
 411 *      The packet type might still be in use by receivers
 412 *      and must not be freed until after all the CPU's have gone
 413 *      through a quiescent state.
 414 */
 415void __dev_remove_pack(struct packet_type *pt)
 416{
 417        struct list_head *head = ptype_head(pt);
 418        struct packet_type *pt1;
 419
 420        spin_lock(&ptype_lock);
 421
 422        list_for_each_entry(pt1, head, list) {
 423                if (pt == pt1) {
 424                        list_del_rcu(&pt->list);
 425                        goto out;
 426                }
 427        }
 428
 429        pr_warn("dev_remove_pack: %p not found\n", pt);
 430out:
 431        spin_unlock(&ptype_lock);
 432}
 433EXPORT_SYMBOL(__dev_remove_pack);
 434
 435/**
 436 *      dev_remove_pack  - remove packet handler
 437 *      @pt: packet type declaration
 438 *
 439 *      Remove a protocol handler that was previously added to the kernel
 440 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 441 *      from the kernel lists and can be freed or reused once this function
 442 *      returns.
 443 *
 444 *      This call sleeps to guarantee that no CPU is looking at the packet
 445 *      type after return.
 446 */
 447void dev_remove_pack(struct packet_type *pt)
 448{
 449        __dev_remove_pack(pt);
 450
 451        synchronize_net();
 452}
 453EXPORT_SYMBOL(dev_remove_pack);
 454
 455
 456/**
 457 *      dev_add_offload - register offload handlers
 458 *      @po: protocol offload declaration
 459 *
 460 *      Add protocol offload handlers to the networking stack. The passed
 461 *      &proto_offload is linked into kernel lists and may not be freed until
 462 *      it has been removed from the kernel lists.
 463 *
 464 *      This call does not sleep therefore it can not
 465 *      guarantee all CPU's that are in middle of receiving packets
 466 *      will see the new offload handlers (until the next received packet).
 467 */
 468void dev_add_offload(struct packet_offload *po)
 469{
 470        struct list_head *head = &offload_base;
 471
 472        spin_lock(&offload_lock);
 473        list_add_rcu(&po->list, head);
 474        spin_unlock(&offload_lock);
 475}
 476EXPORT_SYMBOL(dev_add_offload);
 477
 478/**
 479 *      __dev_remove_offload     - remove offload handler
 480 *      @po: packet offload declaration
 481 *
 482 *      Remove a protocol offload handler that was previously added to the
 483 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 484 *      is removed from the kernel lists and can be freed or reused once this
 485 *      function returns.
 486 *
 487 *      The packet type might still be in use by receivers
 488 *      and must not be freed until after all the CPU's have gone
 489 *      through a quiescent state.
 490 */
 491static void __dev_remove_offload(struct packet_offload *po)
 492{
 493        struct list_head *head = &offload_base;
 494        struct packet_offload *po1;
 495
 496        spin_lock(&offload_lock);
 497
 498        list_for_each_entry(po1, head, list) {
 499                if (po == po1) {
 500                        list_del_rcu(&po->list);
 501                        goto out;
 502                }
 503        }
 504
 505        pr_warn("dev_remove_offload: %p not found\n", po);
 506out:
 507        spin_unlock(&offload_lock);
 508}
 509
 510/**
 511 *      dev_remove_offload       - remove packet offload handler
 512 *      @po: packet offload declaration
 513 *
 514 *      Remove a packet offload handler that was previously added to the kernel
 515 *      offload handlers by dev_add_offload(). The passed &offload_type is
 516 *      removed from the kernel lists and can be freed or reused once this
 517 *      function returns.
 518 *
 519 *      This call sleeps to guarantee that no CPU is looking at the packet
 520 *      type after return.
 521 */
 522void dev_remove_offload(struct packet_offload *po)
 523{
 524        __dev_remove_offload(po);
 525
 526        synchronize_net();
 527}
 528EXPORT_SYMBOL(dev_remove_offload);
 529
 530/******************************************************************************
 531
 532                      Device Boot-time Settings Routines
 533
 534*******************************************************************************/
 535
 536/* Boot time configuration table.
     *
     * Filled by netdev_boot_setup_add() and read by
     * netdev_boot_setup_check() and netdev_boot_base().  A slot whose name
     * starts with '\0' or ' ' is treated as free by the add/check helpers.
     */
 537static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 538
 539/**
 540 *      netdev_boot_setup_add   - add new setup entry
 541 *      @name: name of the device
 542 *      @map: configured settings for the device
 543 *
 544 *      Adds new setup entry to the dev_boot_setup list.  The function
 545 *      returns 0 on error and 1 on success.  This is a generic routine to
 546 *      all netdevices.
 547 */
 548static int netdev_boot_setup_add(char *name, struct ifmap *map)
 549{
 550        struct netdev_boot_setup *s;
 551        int i;
 552
 553        s = dev_boot_setup;
 554        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 555                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 556                        memset(s[i].name, 0, sizeof(s[i].name));
 557                        strlcpy(s[i].name, name, IFNAMSIZ);
 558                        memcpy(&s[i].map, map, sizeof(s[i].map));
 559                        break;
 560                }
 561        }
 562
 563        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 564}
 565
 566/**
 567 *      netdev_boot_setup_check - check boot time settings
 568 *      @dev: the netdevice
 569 *
 570 *      Check boot time settings for the device.
 571 *      The found settings are set for the device to be used
 572 *      later in the device probing.
 573 *      Returns 0 if no settings found, 1 if they are.
 574 */
 575int netdev_boot_setup_check(struct net_device *dev)
 576{
 577        struct netdev_boot_setup *s = dev_boot_setup;
 578        int i;
 579
 580        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 581                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 582                    !strcmp(dev->name, s[i].name)) {
 583                        dev->irq        = s[i].map.irq;
 584                        dev->base_addr  = s[i].map.base_addr;
 585                        dev->mem_start  = s[i].map.mem_start;
 586                        dev->mem_end    = s[i].map.mem_end;
 587                        return 1;
 588                }
 589        }
 590        return 0;
 591}
 592EXPORT_SYMBOL(netdev_boot_setup_check);
 593
 594
 595/**
 596 *      netdev_boot_base        - get address from boot time settings
 597 *      @prefix: prefix for network device
 598 *      @unit: id for network device
 599 *
 600 *      Check boot time settings for the base address of device.
 601 *      The found settings are set for the device to be used
 602 *      later in the device probing.
 603 *      Returns 0 if no settings found.
 604 */
 605unsigned long netdev_boot_base(const char *prefix, int unit)
 606{
 607        const struct netdev_boot_setup *s = dev_boot_setup;
 608        char name[IFNAMSIZ];
 609        int i;
 610
 611        sprintf(name, "%s%d", prefix, unit);
 612
 613        /*
 614         * If device already registered then return base of 1
 615         * to indicate not to probe for this interface
 616         */
 617        if (__dev_get_by_name(&init_net, name))
 618                return 1;
 619
 620        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 621                if (!strcmp(name, s[i].name))
 622                        return s[i].map.base_addr;
 623        return 0;
 624}
 625
 626/*
 627 * Saves at boot time configured settings for any netdevice.
 628 */
 629int __init netdev_boot_setup(char *str)
 630{
 631        int ints[5];
 632        struct ifmap map;
 633
 634        str = get_options(str, ARRAY_SIZE(ints), ints);
 635        if (!str || !*str)
 636                return 0;
 637
 638        /* Save settings */
 639        memset(&map, 0, sizeof(map));
 640        if (ints[0] > 0)
 641                map.irq = ints[1];
 642        if (ints[0] > 1)
 643                map.base_addr = ints[2];
 644        if (ints[0] > 2)
 645                map.mem_start = ints[3];
 646        if (ints[0] > 3)
 647                map.mem_end = ints[4];
 648
 649        /* Add new entry to the list */
 650        return netdev_boot_setup_add(str, &map);
 651}
 652
 653__setup("netdev=", netdev_boot_setup);
 654
 655/*******************************************************************************
 656
 657                            Device Interface Subroutines
 658
 659*******************************************************************************/
 660
 661/**
 662 *      __dev_get_by_name       - find a device by its name
 663 *      @net: the applicable net namespace
 664 *      @name: name to find
 665 *
 666 *      Find an interface by name. Must be called under RTNL semaphore
 667 *      or @dev_base_lock. If the name is found a pointer to the device
 668 *      is returned. If the name is not found then %NULL is returned. The
 669 *      reference counters are not incremented so the caller must be
 670 *      careful with locks.
 671 */
 672
 673struct net_device *__dev_get_by_name(struct net *net, const char *name)
 674{
 675        struct net_device *dev;
 676        struct hlist_head *head = dev_name_hash(net, name);
 677
 678        hlist_for_each_entry(dev, head, name_hlist)
 679                if (!strncmp(dev->name, name, IFNAMSIZ))
 680                        return dev;
 681
 682        return NULL;
 683}
 684EXPORT_SYMBOL(__dev_get_by_name);
 685
 686/**
 687 *      dev_get_by_name_rcu     - find a device by its name
 688 *      @net: the applicable net namespace
 689 *      @name: name to find
 690 *
 691 *      Find an interface by name.
 692 *      If the name is found a pointer to the device is returned.
 693 *      If the name is not found then %NULL is returned.
 694 *      The reference counters are not incremented so the caller must be
 695 *      careful with locks. The caller must hold RCU lock.
 696 */
 697
 698struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 699{
 700        struct net_device *dev;
 701        struct hlist_head *head = dev_name_hash(net, name);
 702
 703        hlist_for_each_entry_rcu(dev, head, name_hlist)
 704                if (!strncmp(dev->name, name, IFNAMSIZ))
 705                        return dev;
 706
 707        return NULL;
 708}
 709EXPORT_SYMBOL(dev_get_by_name_rcu);
 710
 711/**
 712 *      dev_get_by_name         - find a device by its name
 713 *      @net: the applicable net namespace
 714 *      @name: name to find
 715 *
 716 *      Find an interface by name. This can be called from any
 717 *      context and does its own locking. The returned handle has
 718 *      the usage count incremented and the caller must use dev_put() to
 719 *      release it when it is no longer needed. %NULL is returned if no
 720 *      matching device is found.
 721 */
 722
 723struct net_device *dev_get_by_name(struct net *net, const char *name)
 724{
 725        struct net_device *dev;
 726
 727        rcu_read_lock();
 728        dev = dev_get_by_name_rcu(net, name);
 729        if (dev)
 730                dev_hold(dev);
 731        rcu_read_unlock();
 732        return dev;
 733}
 734EXPORT_SYMBOL(dev_get_by_name);
 735
 736/**
 737 *      __dev_get_by_index - find a device by its ifindex
 738 *      @net: the applicable net namespace
 739 *      @ifindex: index of device
 740 *
 741 *      Search for an interface by index. Returns %NULL if the device
 742 *      is not found or a pointer to the device. The device has not
 743 *      had its reference counter increased so the caller must be careful
 744 *      about locking. The caller must hold either the RTNL semaphore
 745 *      or @dev_base_lock.
 746 */
 747
 748struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 749{
 750        struct net_device *dev;
 751        struct hlist_head *head = dev_index_hash(net, ifindex);
 752
 753        hlist_for_each_entry(dev, head, index_hlist)
 754                if (dev->ifindex == ifindex)
 755                        return dev;
 756
 757        return NULL;
 758}
 759EXPORT_SYMBOL(__dev_get_by_index);
 760
 761/**
 762 *      dev_get_by_index_rcu - find a device by its ifindex
 763 *      @net: the applicable net namespace
 764 *      @ifindex: index of device
 765 *
 766 *      Search for an interface by index. Returns %NULL if the device
 767 *      is not found or a pointer to the device. The device has not
 768 *      had its reference counter increased so the caller must be careful
 769 *      about locking. The caller must hold RCU lock.
 770 */
 771
 772struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 773{
 774        struct net_device *dev;
 775        struct hlist_head *head = dev_index_hash(net, ifindex);
 776
 777        hlist_for_each_entry_rcu(dev, head, index_hlist)
 778                if (dev->ifindex == ifindex)
 779                        return dev;
 780
 781        return NULL;
 782}
 783EXPORT_SYMBOL(dev_get_by_index_rcu);
 784
 785
 786/**
 787 *      dev_get_by_index - find a device by its ifindex
 788 *      @net: the applicable net namespace
 789 *      @ifindex: index of device
 790 *
 791 *      Search for an interface by index. Returns NULL if the device
 792 *      is not found or a pointer to the device. The device returned has
 793 *      had a reference added and the pointer is safe until the user calls
 794 *      dev_put to indicate they have finished with it.
 795 */
 796
 797struct net_device *dev_get_by_index(struct net *net, int ifindex)
 798{
 799        struct net_device *dev;
 800
 801        rcu_read_lock();
 802        dev = dev_get_by_index_rcu(net, ifindex);
 803        if (dev)
 804                dev_hold(dev);
 805        rcu_read_unlock();
 806        return dev;
 807}
 808EXPORT_SYMBOL(dev_get_by_index);
 809
 810/**
 811 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 812 *      @net: network namespace
 813 *      @name: a pointer to the buffer where the name will be stored.
 814 *      @ifindex: the ifindex of the interface to get the name from.
 815 *
 816 *      The use of raw_seqcount_begin() and cond_resched() before
 817 *      retrying is required as we want to give the writers a chance
 818 *      to complete when CONFIG_PREEMPT is not set.
 819 */
 820int netdev_get_name(struct net *net, char *name, int ifindex)
 821{
 822        struct net_device *dev;
 823        unsigned int seq;
 824
 825retry:
 826        seq = raw_seqcount_begin(&devnet_rename_seq);
 827        rcu_read_lock();
 828        dev = dev_get_by_index_rcu(net, ifindex);
 829        if (!dev) {
 830                rcu_read_unlock();
 831                return -ENODEV;
 832        }
 833
 834        strcpy(name, dev->name);
 835        rcu_read_unlock();
 836        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 837                cond_resched();
 838                goto retry;
 839        }
 840
 841        return 0;
 842}
 843
 844/**
 845 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 846 *      @net: the applicable net namespace
 847 *      @type: media type of device
 848 *      @ha: hardware address
 849 *
 850 *      Search for an interface by MAC address. Returns NULL if the device
 851 *      is not found or a pointer to the device.
 852 *      The caller must hold RCU or RTNL.
 853 *      The returned device has not had its ref count increased
 854 *      and the caller must therefore be careful about locking
 855 *
 856 */
 857
 858struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 859                                       const char *ha)
 860{
 861        struct net_device *dev;
 862
 863        for_each_netdev_rcu(net, dev)
 864                if (dev->type == type &&
 865                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 866                        return dev;
 867
 868        return NULL;
 869}
 870EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 871
 872struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 873{
 874        struct net_device *dev;
 875
 876        ASSERT_RTNL();
 877        for_each_netdev(net, dev)
 878                if (dev->type == type)
 879                        return dev;
 880
 881        return NULL;
 882}
 883EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 884
 885struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 886{
 887        struct net_device *dev, *ret = NULL;
 888
 889        rcu_read_lock();
 890        for_each_netdev_rcu(net, dev)
 891                if (dev->type == type) {
 892                        dev_hold(dev);
 893                        ret = dev;
 894                        break;
 895                }
 896        rcu_read_unlock();
 897        return ret;
 898}
 899EXPORT_SYMBOL(dev_getfirstbyhwtype);
 900
 901/**
 902 *      __dev_get_by_flags - find any device with given flags
 903 *      @net: the applicable net namespace
 904 *      @if_flags: IFF_* values
 905 *      @mask: bitmask of bits in if_flags to check
 906 *
 907 *      Search for any interface with the given flags. Returns NULL if a device
 908 *      is not found or a pointer to the device. Must be called inside
 909 *      rtnl_lock(), and result refcount is unchanged.
 910 */
 911
 912struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 913                                      unsigned short mask)
 914{
 915        struct net_device *dev, *ret;
 916
 917        ASSERT_RTNL();
 918
 919        ret = NULL;
 920        for_each_netdev(net, dev) {
 921                if (((dev->flags ^ if_flags) & mask) == 0) {
 922                        ret = dev;
 923                        break;
 924                }
 925        }
 926        return ret;
 927}
 928EXPORT_SYMBOL(__dev_get_by_flags);
 929
 930/**
 931 *      dev_valid_name - check if name is okay for network device
 932 *      @name: name string
 933 *
 934 *      Network device names need to be valid file names to
 935 *      to allow sysfs to work.  We also disallow any kind of
 936 *      whitespace.
 937 */
 938bool dev_valid_name(const char *name)
 939{
 940        if (*name == '\0')
 941                return false;
 942        if (strlen(name) >= IFNAMSIZ)
 943                return false;
 944        if (!strcmp(name, ".") || !strcmp(name, ".."))
 945                return false;
 946
 947        while (*name) {
 948                if (*name == '/' || isspace(*name))
 949                        return false;
 950                name++;
 951        }
 952        return true;
 953}
 954EXPORT_SYMBOL(dev_valid_name);
 955
 956/**
 957 *      __dev_alloc_name - allocate a name for a device
 958 *      @net: network namespace to allocate the device name in
 959 *      @name: name format string
 960 *      @buf:  scratch buffer and result name string
 961 *
 962 *      Passed a format string - eg "lt%d" it will try and find a suitable
 963 *      id. It scans list of devices to build up a free map, then chooses
 964 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 965 *      while allocating the name and adding the device in order to avoid
 966 *      duplicates.
 967 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 968 *      Returns the number of the unit assigned or a negative errno code.
 969 */
 970
 971static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 972{
 973        int i = 0;
 974        const char *p;
 975        const int max_netdevices = 8*PAGE_SIZE;
 976        unsigned long *inuse;
 977        struct net_device *d;
 978
 979        p = strnchr(name, IFNAMSIZ-1, '%');
 980        if (p) {
 981                /*
 982                 * Verify the string as this thing may have come from
 983                 * the user.  There must be either one "%d" and no other "%"
 984                 * characters.
 985                 */
 986                if (p[1] != 'd' || strchr(p + 2, '%'))
 987                        return -EINVAL;
 988
 989                /* Use one page as a bit array of possible slots */
 990                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 991                if (!inuse)
 992                        return -ENOMEM;
 993
 994                for_each_netdev(net, d) {
 995                        if (!sscanf(d->name, name, &i))
 996                                continue;
 997                        if (i < 0 || i >= max_netdevices)
 998                                continue;
 999
1000                        /*  avoid cases where sscanf is not exact inverse of printf */
1001                        snprintf(buf, IFNAMSIZ, name, i);
1002                        if (!strncmp(buf, d->name, IFNAMSIZ))
1003                                set_bit(i, inuse);
1004                }
1005
1006                i = find_first_zero_bit(inuse, max_netdevices);
1007                free_page((unsigned long) inuse);
1008        }
1009
1010        if (buf != name)
1011                snprintf(buf, IFNAMSIZ, name, i);
1012        if (!__dev_get_by_name(net, buf))
1013                return i;
1014
1015        /* It is possible to run out of possible slots
1016         * when the name is long and there isn't enough space left
1017         * for the digits, or if all bits are used.
1018         */
1019        return -ENFILE;
1020}
1021
1022/**
1023 *      dev_alloc_name - allocate a name for a device
1024 *      @dev: device
1025 *      @name: name format string
1026 *
1027 *      Passed a format string - eg "lt%d" it will try and find a suitable
1028 *      id. It scans list of devices to build up a free map, then chooses
1029 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1030 *      while allocating the name and adding the device in order to avoid
1031 *      duplicates.
1032 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1033 *      Returns the number of the unit assigned or a negative errno code.
1034 */
1035
1036int dev_alloc_name(struct net_device *dev, const char *name)
1037{
1038        char buf[IFNAMSIZ];
1039        struct net *net;
1040        int ret;
1041
1042        BUG_ON(!dev_net(dev));
1043        net = dev_net(dev);
1044        ret = __dev_alloc_name(net, name, buf);
1045        if (ret >= 0)
1046                strlcpy(dev->name, buf, IFNAMSIZ);
1047        return ret;
1048}
1049EXPORT_SYMBOL(dev_alloc_name);
1050
1051static int dev_alloc_name_ns(struct net *net,
1052                             struct net_device *dev,
1053                             const char *name)
1054{
1055        char buf[IFNAMSIZ];
1056        int ret;
1057
1058        ret = __dev_alloc_name(net, name, buf);
1059        if (ret >= 0)
1060                strlcpy(dev->name, buf, IFNAMSIZ);
1061        return ret;
1062}
1063
1064static int dev_get_valid_name(struct net *net,
1065                              struct net_device *dev,
1066                              const char *name)
1067{
1068        BUG_ON(!net);
1069
1070        if (!dev_valid_name(name))
1071                return -EINVAL;
1072
1073        if (strchr(name, '%'))
1074                return dev_alloc_name_ns(net, dev, name);
1075        else if (__dev_get_by_name(net, name))
1076                return -EEXIST;
1077        else if (dev->name != name)
1078                strlcpy(dev->name, name, IFNAMSIZ);
1079
1080        return 0;
1081}
1082
1083/**
1084 *      dev_change_name - change name of a device
1085 *      @dev: device
1086 *      @newname: name (or format string) must be at least IFNAMSIZ
1087 *
1088 *      Change name of a device, can pass format strings "eth%d".
1089 *      for wildcarding.
1090 */
1091int dev_change_name(struct net_device *dev, const char *newname)
1092{
1093        unsigned char old_assign_type;
1094        char oldname[IFNAMSIZ];
1095        int err = 0;
1096        int ret;
1097        struct net *net;
1098
1099        ASSERT_RTNL();
1100        BUG_ON(!dev_net(dev));
1101
1102        net = dev_net(dev);
1103        if (dev->flags & IFF_UP)
1104                return -EBUSY;
1105
1106        write_seqcount_begin(&devnet_rename_seq);
1107
1108        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1109                write_seqcount_end(&devnet_rename_seq);
1110                return 0;
1111        }
1112
1113        memcpy(oldname, dev->name, IFNAMSIZ);
1114
1115        err = dev_get_valid_name(net, dev, newname);
1116        if (err < 0) {
1117                write_seqcount_end(&devnet_rename_seq);
1118                return err;
1119        }
1120
1121        if (oldname[0] && !strchr(oldname, '%'))
1122                netdev_info(dev, "renamed from %s\n", oldname);
1123
1124        old_assign_type = dev->name_assign_type;
1125        dev->name_assign_type = NET_NAME_RENAMED;
1126
1127rollback:
1128        ret = device_rename(&dev->dev, dev->name);
1129        if (ret) {
1130                memcpy(dev->name, oldname, IFNAMSIZ);
1131                dev->name_assign_type = old_assign_type;
1132                write_seqcount_end(&devnet_rename_seq);
1133                return ret;
1134        }
1135
1136        write_seqcount_end(&devnet_rename_seq);
1137
1138        netdev_adjacent_rename_links(dev, oldname);
1139
1140        write_lock_bh(&dev_base_lock);
1141        hlist_del_rcu(&dev->name_hlist);
1142        write_unlock_bh(&dev_base_lock);
1143
1144        synchronize_rcu();
1145
1146        write_lock_bh(&dev_base_lock);
1147        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1148        write_unlock_bh(&dev_base_lock);
1149
1150        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1151        ret = notifier_to_errno(ret);
1152
1153        if (ret) {
1154                /* err >= 0 after dev_alloc_name() or stores the first errno */
1155                if (err >= 0) {
1156                        err = ret;
1157                        write_seqcount_begin(&devnet_rename_seq);
1158                        memcpy(dev->name, oldname, IFNAMSIZ);
1159                        memcpy(oldname, newname, IFNAMSIZ);
1160                        dev->name_assign_type = old_assign_type;
1161                        old_assign_type = NET_NAME_RENAMED;
1162                        goto rollback;
1163                } else {
1164                        pr_err("%s: name change rollback failed: %d\n",
1165                               dev->name, ret);
1166                }
1167        }
1168
1169        return err;
1170}
1171
1172/**
1173 *      dev_set_alias - change ifalias of a device
1174 *      @dev: device
1175 *      @alias: name up to IFALIASZ
1176 *      @len: limit of bytes to copy from info
1177 *
1178 *      Set ifalias for a device,
1179 */
1180int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1181{
1182        char *new_ifalias;
1183
1184        ASSERT_RTNL();
1185
1186        if (len >= IFALIASZ)
1187                return -EINVAL;
1188
1189        if (!len) {
1190                kfree(dev->ifalias);
1191                dev->ifalias = NULL;
1192                return 0;
1193        }
1194
1195        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1196        if (!new_ifalias)
1197                return -ENOMEM;
1198        dev->ifalias = new_ifalias;
1199
1200        strlcpy(dev->ifalias, alias, len+1);
1201        return len;
1202}
1203
1204
1205/**
1206 *      netdev_features_change - device changes features
1207 *      @dev: device to cause notification
1208 *
1209 *      Called to indicate a device has changed features.
1210 */
1211void netdev_features_change(struct net_device *dev)
1212{
1213        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1214}
1215EXPORT_SYMBOL(netdev_features_change);
1216
1217/**
1218 *      netdev_state_change - device changes state
1219 *      @dev: device to cause notification
1220 *
1221 *      Called to indicate a device has changed state. This function calls
1222 *      the notifier chains for netdev_chain and sends a NEWLINK message
1223 *      to the routing socket.
1224 */
1225void netdev_state_change(struct net_device *dev)
1226{
1227        if (dev->flags & IFF_UP) {
1228                struct netdev_notifier_change_info change_info;
1229
1230                change_info.flags_changed = 0;
1231                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1232                                              &change_info.info);
1233                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1234        }
1235}
1236EXPORT_SYMBOL(netdev_state_change);
1237
1238/**
1239 *      netdev_notify_peers - notify network peers about existence of @dev
1240 *      @dev: network device
1241 *
1242 * Generate traffic such that interested network peers are aware of
1243 * @dev, such as by generating a gratuitous ARP. This may be used when
1244 * a device wants to inform the rest of the network about some sort of
1245 * reconfiguration such as a failover event or virtual machine
1246 * migration.
1247 */
1248void netdev_notify_peers(struct net_device *dev)
1249{
1250        rtnl_lock();
1251        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1252        rtnl_unlock();
1253}
1254EXPORT_SYMBOL(netdev_notify_peers);
1255
1256static int __dev_open(struct net_device *dev)
1257{
1258        const struct net_device_ops *ops = dev->netdev_ops;
1259        int ret;
1260
1261        ASSERT_RTNL();
1262
1263        if (!netif_device_present(dev))
1264                return -ENODEV;
1265
1266        /* Block netpoll from trying to do any rx path servicing.
1267         * If we don't do this there is a chance ndo_poll_controller
1268         * or ndo_poll may be running while we open the device
1269         */
1270        netpoll_poll_disable(dev);
1271
1272        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273        ret = notifier_to_errno(ret);
1274        if (ret)
1275                return ret;
1276
1277        set_bit(__LINK_STATE_START, &dev->state);
1278
1279        if (ops->ndo_validate_addr)
1280                ret = ops->ndo_validate_addr(dev);
1281
1282        if (!ret && ops->ndo_open)
1283                ret = ops->ndo_open(dev);
1284
1285        netpoll_poll_enable(dev);
1286
1287        if (ret)
1288                clear_bit(__LINK_STATE_START, &dev->state);
1289        else {
1290                dev->flags |= IFF_UP;
1291                dev_set_rx_mode(dev);
1292                dev_activate(dev);
1293                add_device_randomness(dev->dev_addr, dev->addr_len);
1294        }
1295
1296        return ret;
1297}
1298
1299/**
1300 *      dev_open        - prepare an interface for use.
1301 *      @dev:   device to open
1302 *
1303 *      Takes a device from down to up state. The device's private open
1304 *      function is invoked and then the multicast lists are loaded. Finally
1305 *      the device is moved into the up state and a %NETDEV_UP message is
1306 *      sent to the netdev notifier chain.
1307 *
1308 *      Calling this function on an active interface is a nop. On a failure
1309 *      a negative errno code is returned.
1310 */
1311int dev_open(struct net_device *dev)
1312{
1313        int ret;
1314
1315        if (dev->flags & IFF_UP)
1316                return 0;
1317
1318        ret = __dev_open(dev);
1319        if (ret < 0)
1320                return ret;
1321
1322        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1323        call_netdevice_notifiers(NETDEV_UP, dev);
1324
1325        return ret;
1326}
1327EXPORT_SYMBOL(dev_open);
1328
1329static int __dev_close_many(struct list_head *head)
1330{
1331        struct net_device *dev;
1332
1333        ASSERT_RTNL();
1334        might_sleep();
1335
1336        list_for_each_entry(dev, head, close_list) {
1337                /* Temporarily disable netpoll until the interface is down */
1338                netpoll_poll_disable(dev);
1339
1340                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1341
1342                clear_bit(__LINK_STATE_START, &dev->state);
1343
1344                /* Synchronize to scheduled poll. We cannot touch poll list, it
1345                 * can be even on different cpu. So just clear netif_running().
1346                 *
1347                 * dev->stop() will invoke napi_disable() on all of it's
1348                 * napi_struct instances on this device.
1349                 */
1350                smp_mb__after_atomic(); /* Commit netif_running(). */
1351        }
1352
1353        dev_deactivate_many(head);
1354
1355        list_for_each_entry(dev, head, close_list) {
1356                const struct net_device_ops *ops = dev->netdev_ops;
1357
1358                /*
1359                 *      Call the device specific close. This cannot fail.
1360                 *      Only if device is UP
1361                 *
1362                 *      We allow it to be called even after a DETACH hot-plug
1363                 *      event.
1364                 */
1365                if (ops->ndo_stop)
1366                        ops->ndo_stop(dev);
1367
1368                dev->flags &= ~IFF_UP;
1369                netpoll_poll_enable(dev);
1370        }
1371
1372        return 0;
1373}
1374
1375static int __dev_close(struct net_device *dev)
1376{
1377        int retval;
1378        LIST_HEAD(single);
1379
1380        list_add(&dev->close_list, &single);
1381        retval = __dev_close_many(&single);
1382        list_del(&single);
1383
1384        return retval;
1385}
1386
1387static int dev_close_many(struct list_head *head)
1388{
1389        struct net_device *dev, *tmp;
1390
1391        /* Remove the devices that don't need to be closed */
1392        list_for_each_entry_safe(dev, tmp, head, close_list)
1393                if (!(dev->flags & IFF_UP))
1394                        list_del_init(&dev->close_list);
1395
1396        __dev_close_many(head);
1397
1398        list_for_each_entry_safe(dev, tmp, head, close_list) {
1399                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400                call_netdevice_notifiers(NETDEV_DOWN, dev);
1401                list_del_init(&dev->close_list);
1402        }
1403
1404        return 0;
1405}
1406
1407/**
1408 *      dev_close - shutdown an interface.
1409 *      @dev: device to shutdown
1410 *
1411 *      This function moves an active device into down state. A
1412 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414 *      chain.
1415 */
1416int dev_close(struct net_device *dev)
1417{
1418        if (dev->flags & IFF_UP) {
1419                LIST_HEAD(single);
1420
1421                list_add(&dev->close_list, &single);
1422                dev_close_many(&single);
1423                list_del(&single);
1424        }
1425        return 0;
1426}
1427EXPORT_SYMBOL(dev_close);
1428
1429
1430/**
1431 *      dev_disable_lro - disable Large Receive Offload on a device
1432 *      @dev: device
1433 *
1434 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1435 *      called under RTNL.  This is needed if received packets may be
1436 *      forwarded to another interface.
1437 */
1438void dev_disable_lro(struct net_device *dev)
1439{
1440        struct net_device *lower_dev;
1441        struct list_head *iter;
1442
1443        dev->wanted_features &= ~NETIF_F_LRO;
1444        netdev_update_features(dev);
1445
1446        if (unlikely(dev->features & NETIF_F_LRO))
1447                netdev_WARN(dev, "failed to disable LRO!\n");
1448
1449        netdev_for_each_lower_dev(dev, lower_dev, iter)
1450                dev_disable_lro(lower_dev);
1451}
1452EXPORT_SYMBOL(dev_disable_lro);
1453
1454static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1455                                   struct net_device *dev)
1456{
1457        struct netdev_notifier_info info;
1458
1459        netdev_notifier_info_init(&info, dev);
1460        return nb->notifier_call(nb, val, &info);
1461}
1462
1463static int dev_boot_phase = 1;
1464
1465/**
1466 *      register_netdevice_notifier - register a network notifier block
1467 *      @nb: notifier
1468 *
1469 *      Register a notifier to be called when network device events occur.
1470 *      The notifier passed is linked into the kernel structures and must
1471 *      not be reused until it has been unregistered. A negative errno code
1472 *      is returned on a failure.
1473 *
1474 *      When registered all registration and up events are replayed
1475 *      to the new notifier to allow device to have a race free
1476 *      view of the network device list.
1477 */
1478
1479int register_netdevice_notifier(struct notifier_block *nb)
1480{
1481        struct net_device *dev;
1482        struct net_device *last;
1483        struct net *net;
1484        int err;
1485
1486        rtnl_lock();
1487        err = raw_notifier_chain_register(&netdev_chain, nb);
1488        if (err)
1489                goto unlock;
1490        if (dev_boot_phase)
1491                goto unlock;
1492        for_each_net(net) {
1493                for_each_netdev(net, dev) {
1494                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1495                        err = notifier_to_errno(err);
1496                        if (err)
1497                                goto rollback;
1498
1499                        if (!(dev->flags & IFF_UP))
1500                                continue;
1501
1502                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1503                }
1504        }
1505
1506unlock:
1507        rtnl_unlock();
1508        return err;
1509
1510rollback:
1511        last = dev;
1512        for_each_net(net) {
1513                for_each_netdev(net, dev) {
1514                        if (dev == last)
1515                                goto outroll;
1516
1517                        if (dev->flags & IFF_UP) {
1518                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1519                                                        dev);
1520                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1521                        }
1522                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1523                }
1524        }
1525
1526outroll:
1527        raw_notifier_chain_unregister(&netdev_chain, nb);
1528        goto unlock;
1529}
1530EXPORT_SYMBOL(register_netdevice_notifier);
1531
1532/**
1533 *      unregister_netdevice_notifier - unregister a network notifier block
1534 *      @nb: notifier
1535 *
1536 *      Unregister a notifier previously registered by
1537 *      register_netdevice_notifier(). The notifier is unlinked into the
1538 *      kernel structures and may then be reused. A negative errno code
1539 *      is returned on a failure.
1540 *
1541 *      After unregistering unregister and down device events are synthesized
1542 *      for all devices on the device list to the removed notifier to remove
1543 *      the need for special case cleanup code.
1544 */
1545
1546int unregister_netdevice_notifier(struct notifier_block *nb)
1547{
1548        struct net_device *dev;
1549        struct net *net;
1550        int err;
1551
1552        rtnl_lock();
1553        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1554        if (err)
1555                goto unlock;
1556
1557        for_each_net(net) {
1558                for_each_netdev(net, dev) {
1559                        if (dev->flags & IFF_UP) {
1560                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1561                                                        dev);
1562                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1563                        }
1564                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1565                }
1566        }
1567unlock:
1568        rtnl_unlock();
1569        return err;
1570}
1571EXPORT_SYMBOL(unregister_netdevice_notifier);
1572
1573/**
1574 *      call_netdevice_notifiers_info - call all network notifier blocks
1575 *      @val: value passed unmodified to notifier function
1576 *      @dev: net_device pointer passed unmodified to notifier function
1577 *      @info: notifier information data
1578 *
1579 *      Call all network notifier blocks.  Parameters and return value
1580 *      are as for raw_notifier_call_chain().
1581 */
1582
1583static int call_netdevice_notifiers_info(unsigned long val,
1584                                         struct net_device *dev,
1585                                         struct netdev_notifier_info *info)
1586{
1587        ASSERT_RTNL();
1588        netdev_notifier_info_init(info, dev);
1589        return raw_notifier_call_chain(&netdev_chain, val, info);
1590}
1591
1592/**
1593 *      call_netdevice_notifiers - call all network notifier blocks
1594 *      @val: value passed unmodified to notifier function
1595 *      @dev: net_device pointer passed unmodified to notifier function
1596 *
1597 *      Call all network notifier blocks.  Parameters and return value
1598 *      are as for raw_notifier_call_chain().
1599 */
1600
1601int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1602{
1603        struct netdev_notifier_info info;
1604
1605        return call_netdevice_notifiers_info(val, dev, &info);
1606}
1607EXPORT_SYMBOL(call_netdevice_notifiers);
1608
1609static struct static_key netstamp_needed __read_mostly;
1610#ifdef HAVE_JUMP_LABEL
1611/* We are not allowed to call static_key_slow_dec() from irq context
1612 * If net_disable_timestamp() is called from irq context, defer the
1613 * static_key_slow_dec() calls.
1614 */
1615static atomic_t netstamp_needed_deferred;
1616#endif
1617
1618void net_enable_timestamp(void)
1619{
1620#ifdef HAVE_JUMP_LABEL
1621        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1622
1623        if (deferred) {
1624                while (--deferred)
1625                        static_key_slow_dec(&netstamp_needed);
1626                return;
1627        }
1628#endif
1629        static_key_slow_inc(&netstamp_needed);
1630}
1631EXPORT_SYMBOL(net_enable_timestamp);
1632
1633void net_disable_timestamp(void)
1634{
1635#ifdef HAVE_JUMP_LABEL
1636        if (in_interrupt()) {
1637                atomic_inc(&netstamp_needed_deferred);
1638                return;
1639        }
1640#endif
1641        static_key_slow_dec(&netstamp_needed);
1642}
1643EXPORT_SYMBOL(net_disable_timestamp);
1644
1645static inline void net_timestamp_set(struct sk_buff *skb)
1646{
1647        skb->tstamp.tv64 = 0;
1648        if (static_key_false(&netstamp_needed))
1649                __net_timestamp(skb);
1650}
1651
/* Stamp @SKB when the netstamp_needed key is enabled, @COND holds and the
 * skb carries no timestamp yet.  Expands to a bare if statement, so use
 * it only as a complete statement.
 */
#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}
1657
1658bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1659{
1660        unsigned int len;
1661
1662        if (!(dev->flags & IFF_UP))
1663                return false;
1664
1665        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1666        if (skb->len <= len)
1667                return true;
1668
1669        /* if TSO is enabled, we don't care about the length as the packet
1670         * could be forwarded without being segmented before
1671         */
1672        if (skb_is_gso(skb))
1673                return true;
1674
1675        return false;
1676}
1677EXPORT_SYMBOL_GPL(is_skb_forwardable);
1678
1679int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680{
1681        if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682                if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683                        atomic_long_inc(&dev->rx_dropped);
1684                        kfree_skb(skb);
1685                        return NET_RX_DROP;
1686                }
1687        }
1688
1689        if (unlikely(!is_skb_forwardable(dev, skb))) {
1690                atomic_long_inc(&dev->rx_dropped);
1691                kfree_skb(skb);
1692                return NET_RX_DROP;
1693        }
1694
1695        skb_scrub_packet(skb, true);
1696        skb->protocol = eth_type_trans(skb, dev);
1697        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1698
1699        return 0;
1700}
1701EXPORT_SYMBOL_GPL(__dev_forward_skb);
1702
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int rc = __dev_forward_skb(dev, skb);

	/* A non-zero result from the preparation step is returned as is. */
	if (rc)
		return rc;

	return netif_rx_internal(skb);
}
1725EXPORT_SYMBOL_GPL(dev_forward_skb);
1726
1727static inline int deliver_skb(struct sk_buff *skb,
1728                              struct packet_type *pt_prev,
1729                              struct net_device *orig_dev)
1730{
1731        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1732                return -ENOMEM;
1733        atomic_inc(&skb->users);
1734        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1735}
1736
1737static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1738{
1739        if (!ptype->af_packet_priv || !skb->sk)
1740                return false;
1741
1742        if (ptype->id_match)
1743                return ptype->id_match(ptype, skb->sk);
1744        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1745                return true;
1746
1747        return false;
1748}
1749
1750/*
1751 *      Support routine. Sends outgoing frames to any network
1752 *      taps currently in use.
1753 */
1754
1755static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1756{
1757        struct packet_type *ptype;
1758        struct sk_buff *skb2 = NULL;
1759        struct packet_type *pt_prev = NULL;
1760
1761        rcu_read_lock();
1762        list_for_each_entry_rcu(ptype, &ptype_all, list) {
1763                /* Never send packets back to the socket
1764                 * they originated from - MvS (miquels@drinkel.ow.org)
1765                 */
1766                if ((ptype->dev == dev || !ptype->dev) &&
1767                    (!skb_loop_sk(ptype, skb))) {
1768                        if (pt_prev) {
1769                                deliver_skb(skb2, pt_prev, skb->dev);
1770                                pt_prev = ptype;
1771                                continue;
1772                        }
1773
1774                        skb2 = skb_clone(skb, GFP_ATOMIC);
1775                        if (!skb2)
1776                                break;
1777
1778                        net_timestamp_set(skb2);
1779
1780                        /* skb->nh should be correctly
1781                           set by sender, so that the second statement is
1782                           just protection against buggy protocols.
1783                         */
1784                        skb_reset_mac_header(skb2);
1785
1786                        if (skb_network_header(skb2) < skb2->data ||
1787                            skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1788                                net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1789                                                     ntohs(skb2->protocol),
1790                                                     dev->name);
1791                                skb_reset_network_header(skb2);
1792                        }
1793
1794                        skb2->transport_header = skb2->network_header;
1795                        skb2->pkt_type = PACKET_OUTGOING;
1796                        pt_prev = ptype;
1797                }
1798        }
1799        if (pt_prev)
1800                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1801        rcu_read_unlock();
1802}
1803
1804/**
1805 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1806 * @dev: Network device
1807 * @txq: number of queues available
1808 *
1809 * If real_num_tx_queues is changed the tc mappings may no longer be
1810 * valid. To resolve this verify the tc mapping remains valid and if
1811 * not NULL the mapping. With no priorities mapping to this
1812 * offset/count pair it will no longer be used. In the worst case TC0
1813 * is invalid nothing can be done so disable priority mappings. If is
1814 * expected that drivers will fix this mapping if they can before
1815 * calling netif_set_real_num_tx_queues.
1816 */
1817static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1818{
1819        int i;
1820        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1821
1822        /* If TC0 is invalidated disable TC mapping */
1823        if (tc->offset + tc->count > txq) {
1824                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1825                dev->num_tc = 0;
1826                return;
1827        }
1828
1829        /* Invalidated prio to tc mappings set to TC0 */
1830        for (i = 1; i < TC_BITMASK + 1; i++) {
1831                int q = netdev_get_prio_tc_map(dev, i);
1832
1833                tc = &dev->tc_to_txq[q];
1834                if (tc->offset + tc->count > txq) {
1835                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1836                                i, q);
1837                        netdev_set_prio_tc_map(dev, i, 0);
1838                }
1839        }
1840}
1841
1842#ifdef CONFIG_XPS
1843static DEFINE_MUTEX(xps_map_mutex);
1844#define xmap_dereference(P)             \
1845        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1846
1847static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1848                                        int cpu, u16 index)
1849{
1850        struct xps_map *map = NULL;
1851        int pos;
1852
1853        if (dev_maps)
1854                map = xmap_dereference(dev_maps->cpu_map[cpu]);
1855
1856        for (pos = 0; map && pos < map->len; pos++) {
1857                if (map->queues[pos] == index) {
1858                        if (map->len > 1) {
1859                                map->queues[pos] = map->queues[--map->len];
1860                        } else {
1861                                RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1862                                kfree_rcu(map, rcu);
1863                                map = NULL;
1864                        }
1865                        break;
1866                }
1867        }
1868
1869        return map;
1870}
1871
1872static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1873{
1874        struct xps_dev_maps *dev_maps;
1875        int cpu, i;
1876        bool active = false;
1877
1878        mutex_lock(&xps_map_mutex);
1879        dev_maps = xmap_dereference(dev->xps_maps);
1880
1881        if (!dev_maps)
1882                goto out_no_maps;
1883
1884        for_each_possible_cpu(cpu) {
1885                for (i = index; i < dev->num_tx_queues; i++) {
1886                        if (!remove_xps_queue(dev_maps, cpu, i))
1887                                break;
1888                }
1889                if (i == dev->num_tx_queues)
1890                        active = true;
1891        }
1892
1893        if (!active) {
1894                RCU_INIT_POINTER(dev->xps_maps, NULL);
1895                kfree_rcu(dev_maps, rcu);
1896        }
1897
1898        for (i = index; i < dev->num_tx_queues; i++)
1899                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1900                                             NUMA_NO_NODE);
1901
1902out_no_maps:
1903        mutex_unlock(&xps_map_mutex);
1904}
1905
1906static struct xps_map *expand_xps_map(struct xps_map *map,
1907                                      int cpu, u16 index)
1908{
1909        struct xps_map *new_map;
1910        int alloc_len = XPS_MIN_MAP_ALLOC;
1911        int i, pos;
1912
1913        for (pos = 0; map && pos < map->len; pos++) {
1914                if (map->queues[pos] != index)
1915                        continue;
1916                return map;
1917        }
1918
1919        /* Need to add queue to this CPU's existing map */
1920        if (map) {
1921                if (pos < map->alloc_len)
1922                        return map;
1923
1924                alloc_len = map->alloc_len * 2;
1925        }
1926
1927        /* Need to allocate new map to store queue on this CPU's map */
1928        new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1929                               cpu_to_node(cpu));
1930        if (!new_map)
1931                return NULL;
1932
1933        for (i = 0; i < pos; i++)
1934                new_map->queues[i] = map->queues[i];
1935        new_map->alloc_len = alloc_len;
1936        new_map->len = pos;
1937
1938        return new_map;
1939}
1940
1941int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1942                        u16 index)
1943{
1944        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1945        struct xps_map *map, *new_map;
1946        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1947        int cpu, numa_node_id = -2;
1948        bool active = false;
1949
1950        mutex_lock(&xps_map_mutex);
1951
1952        dev_maps = xmap_dereference(dev->xps_maps);
1953
1954        /* allocate memory for queue storage */
1955        for_each_online_cpu(cpu) {
1956                if (!cpumask_test_cpu(cpu, mask))
1957                        continue;
1958
1959                if (!new_dev_maps)
1960                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1961                if (!new_dev_maps) {
1962                        mutex_unlock(&xps_map_mutex);
1963                        return -ENOMEM;
1964                }
1965
1966                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1967                                 NULL;
1968
1969                map = expand_xps_map(map, cpu, index);
1970                if (!map)
1971                        goto error;
1972
1973                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1974        }
1975
1976        if (!new_dev_maps)
1977                goto out_no_new_maps;
1978
1979        for_each_possible_cpu(cpu) {
1980                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1981                        /* add queue to CPU maps */
1982                        int pos = 0;
1983
1984                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1985                        while ((pos < map->len) && (map->queues[pos] != index))
1986                                pos++;
1987
1988                        if (pos == map->len)
1989                                map->queues[map->len++] = index;
1990#ifdef CONFIG_NUMA
1991                        if (numa_node_id == -2)
1992                                numa_node_id = cpu_to_node(cpu);
1993                        else if (numa_node_id != cpu_to_node(cpu))
1994                                numa_node_id = -1;
1995#endif
1996                } else if (dev_maps) {
1997                        /* fill in the new device map from the old device map */
1998                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
1999                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2000                }
2001
2002        }
2003
2004        rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2005
2006        /* Cleanup old maps */
2007        if (dev_maps) {
2008                for_each_possible_cpu(cpu) {
2009                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2010                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2011                        if (map && map != new_map)
2012                                kfree_rcu(map, rcu);
2013                }
2014
2015                kfree_rcu(dev_maps, rcu);
2016        }
2017
2018        dev_maps = new_dev_maps;
2019        active = true;
2020
2021out_no_new_maps:
2022        /* update Tx queue numa node */
2023        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2024                                     (numa_node_id >= 0) ? numa_node_id :
2025                                     NUMA_NO_NODE);
2026
2027        if (!dev_maps)
2028                goto out_no_maps;
2029
2030        /* removes queue from unused CPUs */
2031        for_each_possible_cpu(cpu) {
2032                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2033                        continue;
2034
2035                if (remove_xps_queue(dev_maps, cpu, index))
2036                        active = true;
2037        }
2038
2039        /* free map if not active */
2040        if (!active) {
2041                RCU_INIT_POINTER(dev->xps_maps, NULL);
2042                kfree_rcu(dev_maps, rcu);
2043        }
2044
2045out_no_maps:
2046        mutex_unlock(&xps_map_mutex);
2047
2048        return 0;
2049error:
2050        /* remove any maps that we added */
2051        for_each_possible_cpu(cpu) {
2052                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2053                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2054                                 NULL;
2055                if (new_map && new_map != map)
2056                        kfree(new_map);
2057        }
2058
2059        mutex_unlock(&xps_map_mutex);
2060
2061        kfree(new_dev_maps);
2062        return -ENOMEM;
2063}
2064EXPORT_SYMBOL(netif_set_xps_queue);
2065
2066#endif
2067/*
2068 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2069 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2070 */
2071int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2072{
2073        int rc;
2074
2075        if (txq < 1 || txq > dev->num_tx_queues)
2076                return -EINVAL;
2077
2078        if (dev->reg_state == NETREG_REGISTERED ||
2079            dev->reg_state == NETREG_UNREGISTERING) {
2080                ASSERT_RTNL();
2081
2082                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2083                                                  txq);
2084                if (rc)
2085                        return rc;
2086
2087                if (dev->num_tc)
2088                        netif_setup_tc(dev, txq);
2089
2090                if (txq < dev->real_num_tx_queues) {
2091                        qdisc_reset_all_tx_gt(dev, txq);
2092#ifdef CONFIG_XPS
2093                        netif_reset_xps_queues_gt(dev, txq);
2094#endif
2095                }
2096        }
2097
2098        dev->real_num_tx_queues = txq;
2099        return 0;
2100}
2101EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2102
#ifdef CONFIG_SYSFS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	Must be called either with the rtnl_lock held or before the net
 *	device is registered.  Returns 0 on success, or a negative error
 *	code (-EINVAL when @rxq is zero or larger than dev->num_rx_queues).
 *	If called before registration, only that range check can fail.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq == 0 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int rc;

		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
2135
2136/**
2137 * netif_get_num_default_rss_queues - default number of RSS queues
2138 *
2139 * This routine should set an upper limit on the number of RSS queues
2140 * used by default by multiqueue devices.
2141 */
2142int netif_get_num_default_rss_queues(void)
2143{
2144        return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2145}
2146EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2147
2148static inline void __netif_reschedule(struct Qdisc *q)
2149{
2150        struct softnet_data *sd;
2151        unsigned long flags;
2152
2153        local_irq_save(flags);
2154        sd = this_cpu_ptr(&softnet_data);
2155        q->next_sched = NULL;
2156        *sd->output_queue_tailp = q;
2157        sd->output_queue_tailp = &q->next_sched;
2158        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2159        local_irq_restore(flags);
2160}
2161
2162void __netif_schedule(struct Qdisc *q)
2163{
2164        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2165                __netif_reschedule(q);
2166}
2167EXPORT_SYMBOL(__netif_schedule);
2168
2169struct dev_kfree_skb_cb {
2170        enum skb_free_reason reason;
2171};
2172
2173static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2174{
2175        return (struct dev_kfree_skb_cb *)skb->cb;
2176}
2177
2178void netif_schedule_queue(struct netdev_queue *txq)
2179{
2180        rcu_read_lock();
2181        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2182                struct Qdisc *q = rcu_dereference(txq->qdisc);
2183
2184                __netif_schedule(q);
2185        }
2186        rcu_read_unlock();
2187}
2188EXPORT_SYMBOL(netif_schedule_queue);
2189
2190/**
2191 *      netif_wake_subqueue - allow sending packets on subqueue
2192 *      @dev: network device
2193 *      @queue_index: sub queue index
2194 *
2195 * Resume individual transmit queue of a device with multiple transmit queues.
2196 */
2197void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2198{
2199        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2200
2201        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2202                struct Qdisc *q;
2203
2204                rcu_read_lock();
2205                q = rcu_dereference(txq->qdisc);
2206                __netif_schedule(q);
2207                rcu_read_unlock();
2208        }
2209}
2210EXPORT_SYMBOL(netif_wake_subqueue);
2211
2212void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2213{
2214        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2215                struct Qdisc *q;
2216
2217                rcu_read_lock();
2218                q = rcu_dereference(dev_queue->qdisc);
2219                __netif_schedule(q);
2220                rcu_read_unlock();
2221        }
2222}
2223EXPORT_SYMBOL(netif_tx_wake_queue);
2224
2225void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2226{
2227        unsigned long flags;
2228
2229        if (likely(atomic_read(&skb->users) == 1)) {
2230                smp_rmb();
2231                atomic_set(&skb->users, 0);
2232        } else if (likely(!atomic_dec_and_test(&skb->users))) {
2233                return;
2234        }
2235        get_kfree_skb_cb(skb)->reason = reason;
2236        local_irq_save(flags);
2237        skb->next = __this_cpu_read(softnet_data.completion_queue);
2238        __this_cpu_write(softnet_data.completion_queue, skb);
2239        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2240        local_irq_restore(flags);
2241}
2242EXPORT_SYMBOL(__dev_kfree_skb_irq);
2243
2244void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2245{
2246        if (in_irq() || irqs_disabled())
2247                __dev_kfree_skb_irq(skb, reason);
2248        else
2249                dev_kfree_skb(skb);
2250}
2251EXPORT_SYMBOL(__dev_kfree_skb_any);
2252
2253
2254/**
2255 * netif_device_detach - mark device as removed
2256 * @dev: network device
2257 *
2258 * Mark device as removed from system and therefore no longer available.
2259 */
2260void netif_device_detach(struct net_device *dev)
2261{
2262        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2263            netif_running(dev)) {
2264                netif_tx_stop_all_queues(dev);
2265        }
2266}
2267EXPORT_SYMBOL(netif_device_detach);
2268
2269/**
2270 * netif_device_attach - mark device as attached
2271 * @dev: network device
2272 *
2273 * Mark device as attached from system and restart if needed.
2274 */
2275void netif_device_attach(struct net_device *dev)
2276{
2277        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2278            netif_running(dev)) {
2279                netif_tx_wake_all_queues(dev);
2280                __netdev_watchdog_up(dev);
2281        }
2282}
2283EXPORT_SYMBOL(netif_device_attach);
2284
2285static void skb_warn_bad_offload(const struct sk_buff *skb)
2286{
2287        static const netdev_features_t null_features = 0;
2288        struct net_device *dev = skb->dev;
2289        const char *driver = "";
2290
2291        if (!net_ratelimit())
2292                return;
2293
2294        if (dev && dev->dev.parent)
2295                driver = dev_driver_string(dev->dev.parent);
2296
2297        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2298             "gso_type=%d ip_summed=%d\n",
2299             driver, dev ? &dev->features : &null_features,
2300             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2301             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2302             skb_shinfo(skb)->gso_type, skb->ip_summed);
2303}
2304
2305/*
2306 * Invalidate hardware checksum when packet is to be mangled, and
2307 * complete checksum manually on outgoing path.
2308 */
2309int skb_checksum_help(struct sk_buff *skb)
2310{
2311        __wsum csum;
2312        int ret = 0, offset;
2313
2314        if (skb->ip_summed == CHECKSUM_COMPLETE)
2315                goto out_set_summed;
2316
2317        if (unlikely(skb_shinfo(skb)->gso_size)) {
2318                skb_warn_bad_offload(skb);
2319                return -EINVAL;
2320        }
2321
2322        /* Before computing a checksum, we should make sure no frag could
2323         * be modified by an external entity : checksum could be wrong.
2324         */
2325        if (skb_has_shared_frag(skb)) {
2326                ret = __skb_linearize(skb);
2327                if (ret)
2328                        goto out;
2329        }
2330
2331        offset = skb_checksum_start_offset(skb);
2332        BUG_ON(offset >= skb_headlen(skb));
2333        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2334
2335        offset += skb->csum_offset;
2336        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2337
2338        if (skb_cloned(skb) &&
2339            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2340                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2341                if (ret)
2342                        goto out;
2343        }
2344
2345        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2346out_set_summed:
2347        skb->ip_summed = CHECKSUM_NONE;
2348out:
2349        return ret;
2350}
2351EXPORT_SYMBOL(skb_checksum_help);
2352
2353__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2354{
2355        __be16 type = skb->protocol;
2356
2357        /* Tunnel gso handlers can set protocol to ethernet. */
2358        if (type == htons(ETH_P_TEB)) {
2359                struct ethhdr *eth;
2360
2361                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2362                        return 0;
2363
2364                eth = (struct ethhdr *)skb_mac_header(skb);
2365                type = eth->h_proto;
2366        }
2367
2368        return __vlan_get_protocol(skb, type, depth);
2369}
2370
2371/**
2372 *      skb_mac_gso_segment - mac layer segmentation handler.
2373 *      @skb: buffer to segment
2374 *      @features: features for the output path (see dev->features)
2375 */
2376struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2377                                    netdev_features_t features)
2378{
2379        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2380        struct packet_offload *ptype;
2381        int vlan_depth = skb->mac_len;
2382        __be16 type = skb_network_protocol(skb, &vlan_depth);
2383
2384        if (unlikely(!type))
2385                return ERR_PTR(-EINVAL);
2386
2387        __skb_pull(skb, vlan_depth);
2388
2389        rcu_read_lock();
2390        list_for_each_entry_rcu(ptype, &offload_base, list) {
2391                if (ptype->type == type && ptype->callbacks.gso_segment) {
2392                        segs = ptype->callbacks.gso_segment(skb, features);
2393                        break;
2394                }
2395        }
2396        rcu_read_unlock();
2397
2398        __skb_push(skb, skb->data - skb_mac_header(skb));
2399
2400        return segs;
2401}
2402EXPORT_SYMBOL(skb_mac_gso_segment);
2403
2404
2405/* openvswitch calls this on rx path, so we need a different check.
2406 */
2407static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2408{
2409        if (tx_path)
2410                return skb->ip_summed != CHECKSUM_PARTIAL;
2411        else
2412                return skb->ip_summed == CHECKSUM_NONE;
2413}
2414
2415/**
2416 *      __skb_gso_segment - Perform segmentation on skb.
2417 *      @skb: buffer to segment
2418 *      @features: features for the output path (see dev->features)
2419 *      @tx_path: whether it is called in TX path
2420 *
2421 *      This function segments the given skb and returns a list of segments.
2422 *
2423 *      It may return NULL if the skb requires no segmentation.  This is
2424 *      only possible when GSO is used for verifying header integrity.
2425 */
2426struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2427                                  netdev_features_t features, bool tx_path)
2428{
2429        if (unlikely(skb_needs_check(skb, tx_path))) {
2430                int err;
2431
2432                skb_warn_bad_offload(skb);
2433
2434                err = skb_cow_head(skb, 0);
2435                if (err < 0)
2436                        return ERR_PTR(err);
2437        }
2438
2439        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2440        SKB_GSO_CB(skb)->encap_level = 0;
2441
2442        skb_reset_mac_header(skb);
2443        skb_reset_mac_len(skb);
2444
2445        return skb_mac_gso_segment(skb, features);
2446}
2447EXPORT_SYMBOL(__skb_gso_segment);
2448
/* Take action when hardware reception checksum errors are detected:
 * log the device name and dump the stack, rate limited.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name = dev ? dev->name : "<unknown>";

	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2460
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a page fragment of @skb is in high memory while @dev
 * lacks NETIF_F_HIGHDMA, or (with PCI_DMA_BUS_IS_PHYS) lies beyond the
 * DMA mask of the parent device; 0 otherwise.  Always 0 without
 * CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int f;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[f];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[f];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		/* the last byte of the page must be covered by the mask */
		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2493
2494/* If MPLS offload request, verify we are testing hardware MPLS features
2495 * instead of standard features for the netdev.
2496 */
2497#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2498static netdev_features_t net_mpls_features(struct sk_buff *skb,
2499                                           netdev_features_t features,
2500                                           __be16 type)
2501{
2502        if (eth_p_mpls(type))
2503                features &= skb->dev->mpls_features;
2504
2505        return features;
2506}
2507#else
2508static netdev_features_t net_mpls_features(struct sk_buff *skb,
2509                                           netdev_features_t features,
2510                                           __be16 type)
2511{
2512        return features;
2513}
2514#endif
2515
2516static netdev_features_t harmonize_features(struct sk_buff *skb,
2517        netdev_features_t features)
2518{
2519        int tmp;
2520        __be16 type;
2521
2522        type = skb_network_protocol(skb, &tmp);
2523        features = net_mpls_features(skb, features, type);
2524
2525        if (skb->ip_summed != CHECKSUM_NONE &&
2526            !can_checksum_protocol(features, type)) {
2527                features &= ~NETIF_F_ALL_CSUM;
2528        } else if (illegal_highdma(skb->dev, skb)) {
2529                features &= ~NETIF_F_SG;
2530        }
2531
2532        return features;
2533}
2534
2535netdev_features_t netif_skb_features(struct sk_buff *skb)
2536{
2537        struct net_device *dev = skb->dev;
2538        netdev_features_t features = dev->features;
2539        u16 gso_segs = skb_shinfo(skb)->gso_segs;
2540        __be16 protocol = skb->protocol;
2541
2542        if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2543                features &= ~NETIF_F_GSO_MASK;
2544
2545        /* If encapsulation offload request, verify we are testing
2546         * hardware encapsulation features instead of standard
2547         * features for the netdev
2548         */
2549        if (skb->encapsulation)
2550                features &= dev->hw_enc_features;
2551
2552        if (!vlan_tx_tag_present(skb)) {
2553                if (unlikely(protocol == htons(ETH_P_8021Q) ||
2554                             protocol == htons(ETH_P_8021AD))) {
2555                        struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2556                        protocol = veh->h_vlan_encapsulated_proto;
2557                } else {
2558                        goto finalize;
2559                }
2560        }
2561
2562        features = netdev_intersect_features(features,
2563                                             dev->vlan_features |
2564                                             NETIF_F_HW_VLAN_CTAG_TX |
2565                                             NETIF_F_HW_VLAN_STAG_TX);
2566
2567        if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2568                features = netdev_intersect_features(features,
2569                                                     NETIF_F_SG |
2570                                                     NETIF_F_HIGHDMA |
2571                                                     NETIF_F_FRAGLIST |
2572                                                     NETIF_F_GEN_CSUM |
2573                                                     NETIF_F_HW_VLAN_CTAG_TX |
2574                                                     NETIF_F_HW_VLAN_STAG_TX);
2575
2576finalize:
2577        if (dev->netdev_ops->ndo_features_check)
2578                features &= dev->netdev_ops->ndo_features_check(skb, dev,
2579                                                                features);
2580
2581        return harmonize_features(skb, features);
2582}
2583EXPORT_SYMBOL(netif_skb_features);
2584
2585static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2586                    struct netdev_queue *txq, bool more)
2587{
2588        unsigned int len;
2589        int rc;
2590
2591        if (!list_empty(&ptype_all))
2592                dev_queue_xmit_nit(skb, dev);
2593
2594        len = skb->len;
2595        trace_net_dev_start_xmit(skb, dev);
2596        rc = netdev_start_xmit(skb, dev, txq, more);
2597        trace_net_dev_xmit(skb, rc, dev, len);
2598
2599        return rc;
2600}
2601
2602struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2603                                    struct netdev_queue *txq, int *ret)
2604{
2605        struct sk_buff *skb = first;
2606        int rc = NETDEV_TX_OK;
2607
2608        while (skb) {
2609                struct sk_buff *next = skb->next;
2610
2611                skb->next = NULL;
2612                rc = xmit_one(skb, dev, txq, next != NULL);
2613                if (unlikely(!dev_xmit_complete(rc))) {
2614                        skb->next = next;
2615                        goto out;
2616                }
2617
2618                skb = next;
2619                if (netif_xmit_stopped(txq) && skb) {
2620                        rc = NETDEV_TX_BUSY;
2621                        break;
2622                }
2623        }
2624
2625out:
2626        *ret = rc;
2627        return skb;
2628}
2629
2630static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2631                                          netdev_features_t features)
2632{
2633        if (vlan_tx_tag_present(skb) &&
2634            !vlan_hw_offload_capable(features, skb->vlan_proto))
2635                skb = __vlan_hwaccel_push_inside(skb);
2636        return skb;
2637}
2638
2639static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2640{
2641        netdev_features_t features;
2642
2643        if (skb->next)
2644                return skb;
2645
2646        features = netif_skb_features(skb);
2647        skb = validate_xmit_vlan(skb, features);
2648        if (unlikely(!skb))
2649                goto out_null;
2650
2651        if (netif_needs_gso(dev, skb, features)) {
2652                struct sk_buff *segs;
2653
2654                segs = skb_gso_segment(skb, features);
2655                if (IS_ERR(segs)) {
2656                        goto out_kfree_skb;
2657                } else if (segs) {
2658                        consume_skb(skb);
2659                        skb = segs;
2660                }
2661        } else {
2662                if (skb_needs_linearize(skb, features) &&
2663                    __skb_linearize(skb))
2664                        goto out_kfree_skb;
2665
2666                /* If packet is not checksummed and device does not
2667                 * support checksumming for this protocol, complete
2668                 * checksumming here.
2669                 */
2670                if (skb->ip_summed == CHECKSUM_PARTIAL) {
2671                        if (skb->encapsulation)
2672                                skb_set_inner_transport_header(skb,
2673                                                               skb_checksum_start_offset(skb));
2674                        else
2675                                skb_set_transport_header(skb,
2676                                                         skb_checksum_start_offset(skb));
2677                        if (!(features & NETIF_F_ALL_CSUM) &&
2678                            skb_checksum_help(skb))
2679                                goto out_kfree_skb;
2680                }
2681        }
2682
2683        return skb;
2684
2685out_kfree_skb:
2686        kfree_skb(skb);
2687out_null:
2688        return NULL;
2689}
2690
2691struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2692{
2693        struct sk_buff *next, *head = NULL, *tail;
2694
2695        for (; skb != NULL; skb = next) {
2696                next = skb->next;
2697                skb->next = NULL;
2698
2699                /* in case skb wont be segmented, point to itself */
2700                skb->prev = skb;
2701
2702                skb = validate_xmit_skb(skb, dev);
2703                if (!skb)
2704                        continue;
2705
2706                if (!head)
2707                        head = skb;
2708                else
2709                        tail->next = skb;
2710                /* If skb was segmented, skb->prev points to
2711                 * the last segment. If not, it still contains skb.
2712                 */
2713                tail = skb->prev;
2714        }
2715        return head;
2716}
2717
2718static void qdisc_pkt_len_init(struct sk_buff *skb)
2719{
2720        const struct skb_shared_info *shinfo = skb_shinfo(skb);
2721
2722        qdisc_skb_cb(skb)->pkt_len = skb->len;
2723
2724        /* To get more precise estimation of bytes sent on wire,
2725         * we add to pkt_len the headers size of all segments
2726         */
2727        if (shinfo->gso_size)  {
2728                unsigned int hdr_len;
2729                u16 gso_segs = shinfo->gso_segs;
2730
2731                /* mac layer + network layer */
2732                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2733
2734                /* + transport layer */
2735                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2736                        hdr_len += tcp_hdrlen(skb);
2737                else
2738                        hdr_len += sizeof(struct udphdr);
2739
2740                if (shinfo->gso_type & SKB_GSO_DODGY)
2741                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2742                                                shinfo->gso_size);
2743
2744                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2745        }
2746}
2747
2748static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2749                                 struct net_device *dev,
2750                                 struct netdev_queue *txq)
2751{
2752        spinlock_t *root_lock = qdisc_lock(q);
2753        bool contended;
2754        int rc;
2755
2756        qdisc_pkt_len_init(skb);
2757        qdisc_calculate_pkt_len(skb, q);
2758        /*
2759         * Heuristic to force contended enqueues to serialize on a
2760         * separate lock before trying to get qdisc main lock.
2761         * This permits __QDISC___STATE_RUNNING owner to get the lock more
2762         * often and dequeue packets faster.
2763         */
2764        contended = qdisc_is_running(q);
2765        if (unlikely(contended))
2766                spin_lock(&q->busylock);
2767
2768        spin_lock(root_lock);
2769        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2770                kfree_skb(skb);
2771                rc = NET_XMIT_DROP;
2772        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2773                   qdisc_run_begin(q)) {
2774                /*
2775                 * This is a work-conserving queue; there are no old skbs
2776                 * waiting to be sent out; and the qdisc is not running -
2777                 * xmit the skb directly.
2778                 */
2779
2780                qdisc_bstats_update(q, skb);
2781
2782                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2783                        if (unlikely(contended)) {
2784                                spin_unlock(&q->busylock);
2785                                contended = false;
2786                        }
2787                        __qdisc_run(q);
2788                } else
2789                        qdisc_run_end(q);
2790
2791                rc = NET_XMIT_SUCCESS;
2792        } else {
2793                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2794                if (qdisc_run_begin(q)) {
2795                        if (unlikely(contended)) {
2796                                spin_unlock(&q->busylock);
2797                                contended = false;
2798                        }
2799                        __qdisc_run(q);
2800                }
2801        }
2802        spin_unlock(root_lock);
2803        if (unlikely(contended))
2804                spin_unlock(&q->busylock);
2805        return rc;
2806}
2807
2808#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2809static void skb_update_prio(struct sk_buff *skb)
2810{
2811        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2812
2813        if (!skb->priority && skb->sk && map) {
2814                unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2815
2816                if (prioidx < map->priomap_len)
2817                        skb->priority = map->priomap[prioidx];
2818        }
2819}
2820#else
2821#define skb_update_prio(skb)
2822#endif
2823
/*
 * Per-CPU counter that __dev_queue_xmit() increments around its call to
 * dev_hard_start_xmit() on the queueless-device path.  When the counter
 * is above RECURSION_LIMIT, __dev_queue_xmit() refuses to transmit and
 * logs a "Dead loop" message instead.
 */
2824static DEFINE_PER_CPU(int, xmit_recursion);
2825#define RECURSION_LIMIT 10
2826
2827/**
2828 *      dev_loopback_xmit - loop back @skb
2829 *      @skb: buffer to transmit
2830 */
2831int dev_loopback_xmit(struct sk_buff *skb)
2832{
2833        skb_reset_mac_header(skb);
2834        __skb_pull(skb, skb_network_offset(skb));
2835        skb->pkt_type = PACKET_LOOPBACK;
2836        skb->ip_summed = CHECKSUM_UNNECESSARY;
2837        WARN_ON(!skb_dst(skb));
2838        skb_dst_force(skb);
2839        netif_rx_ni(skb);
2840        return 0;
2841}
2842EXPORT_SYMBOL(dev_loopback_xmit);
2843
2844/**
2845 *      __dev_queue_xmit - transmit a buffer
2846 *      @skb: buffer to transmit
2847 *      @accel_priv: private data used for L2 forwarding offload
2848 *
2849 *      Queue a buffer for transmission to a network device. The caller must
2850 *      have set the device and priority and built the buffer before calling
2851 *      this function. The function can be called from an interrupt.
2852 *
2853 *      A negative errno code is returned on a failure. A success does not
2854 *      guarantee the frame will be transmitted as it may be dropped due
2855 *      to congestion or traffic shaping.
2856 *
2857 * -----------------------------------------------------------------------------------
2858 *      I notice this method can also return errors from the queue disciplines,
2859 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2860 *      be positive.
2861 *
2862 *      Regardless of the return value, the skb is consumed, so it is currently
2863 *      difficult to retry a send to this method.  (You can bump the ref count
2864 *      before sending to hold a reference for retry if you are careful.)
2865 *
2866 *      When calling this method, interrupts MUST be enabled.  This is because
2867 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2868 *          --BLG
2869 */
2870static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2871{
2872        struct net_device *dev = skb->dev;
2873        struct netdev_queue *txq;
2874        struct Qdisc *q;
2875        int rc = -ENOMEM;
2876
2877        skb_reset_mac_header(skb);
2878
2879        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2880                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2881
2882        /* Disable soft irqs for various locks below. Also
2883         * stops preemption for RCU.
2884         */
2885        rcu_read_lock_bh();
2886
2887        skb_update_prio(skb);
2888
2889        /* If device/qdisc don't need skb->dst, release it right now while
2890         * its hot in this cpu cache.
2891         */
2892        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2893                skb_dst_drop(skb);
2894        else
2895                skb_dst_force(skb);
2896
2897        txq = netdev_pick_tx(dev, skb, accel_priv);
2898        q = rcu_dereference_bh(txq->qdisc);
2899
2900#ifdef CONFIG_NET_CLS_ACT
2901        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2902#endif
2903        trace_net_dev_queue(skb);
2904        if (q->enqueue) {
2905                rc = __dev_xmit_skb(skb, q, dev, txq);
2906                goto out;
2907        }
2908
2909        /* The device has no queue. Common case for software devices:
2910           loopback, all the sorts of tunnels...
2911
2912           Really, it is unlikely that netif_tx_lock protection is necessary
2913           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2914           counters.)
2915           However, it is possible, that they rely on protection
2916           made by us here.
2917
2918           Check this and shot the lock. It is not prone from deadlocks.
2919           Either shot noqueue qdisc, it is even simpler 8)
2920         */
2921        if (dev->flags & IFF_UP) {
2922                int cpu = smp_processor_id(); /* ok because BHs are off */
2923
2924                if (txq->xmit_lock_owner != cpu) {
2925
2926                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2927                                goto recursion_alert;
2928
2929                        skb = validate_xmit_skb(skb, dev);
2930                        if (!skb)
2931                                goto drop;
2932
2933                        HARD_TX_LOCK(dev, txq, cpu);
2934
2935                        if (!netif_xmit_stopped(txq)) {
2936                                __this_cpu_inc(xmit_recursion);
2937                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2938                                __this_cpu_dec(xmit_recursion);
2939                                if (dev_xmit_complete(rc)) {
2940                                        HARD_TX_UNLOCK(dev, txq);
2941                                        goto out;
2942                                }
2943                        }
2944                        HARD_TX_UNLOCK(dev, txq);
2945                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2946                                             dev->name);
2947                } else {
2948                        /* Recursion is detected! It is possible,
2949                         * unfortunately
2950                         */
2951recursion_alert:
2952                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2953                                             dev->name);
2954                }
2955        }
2956
2957        rc = -ENETDOWN;
2958drop:
2959        rcu_read_unlock_bh();
2960
2961        atomic_long_inc(&dev->tx_dropped);
2962        kfree_skb_list(skb);
2963        return rc;
2964out:
2965        rcu_read_unlock_bh();
2966        return rc;
2967}
2968
/*
 * dev_queue_xmit - queue @skb for transmission on skb->dev
 *
 * Thin wrapper around __dev_queue_xmit() that passes NULL as the
 * accel_priv (L2 forwarding offload private data) argument.  The return
 * value is that of __dev_queue_xmit().
 */
2969int dev_queue_xmit(struct sk_buff *skb)
2970{
2971        return __dev_queue_xmit(skb, NULL);
2972}
2973EXPORT_SYMBOL(dev_queue_xmit);
2974
/*
 * dev_queue_xmit_accel - queue @skb for transmission, with offload data
 *
 * Same as dev_queue_xmit(), except that @accel_priv is forwarded to
 * __dev_queue_xmit() instead of NULL.  The return value is that of
 * __dev_queue_xmit().
 */
2975int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2976{
2977        return __dev_queue_xmit(skb, accel_priv);
2978}
2979EXPORT_SYMBOL(dev_queue_xmit_accel);
2980
2981
2982/*=======================================================================
2983                        Receiver routines
2984  =======================================================================*/
2985
/*
 * Upper bound on the length of a per-CPU input_pkt_queue: once the
 * queue is longer than this, enqueue_to_backlog() drops the packet.
 * skb_flow_limit() only starts acting when the queue is at least half
 * this long.
 */
2986int netdev_max_backlog __read_mostly = 1000;
2987EXPORT_SYMBOL(netdev_max_backlog);
2988
/* Passed to net_timestamp_check() by netif_rx_internal(). */
2989int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): netdev_budget and weight_p are not used in this part of
 * the file; their consumers should be checked before documenting them
 * further.
 */
2990int netdev_budget __read_mostly = 300;
2991int weight_p __read_mostly = 64;            /* old backlog weight */
2992
/*
 * Append @napi to the tail of @sd's poll_list and raise NET_RX_SOFTIRQ
 * using the _irqoff variant of the raise helper.
 */
2993/* Called with irq disabled */
2994static inline void ____napi_schedule(struct softnet_data *sd,
2995                                     struct napi_struct *napi)
2996{
2997        list_add_tail(&napi->poll_list, &sd->poll_list);
2998        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2999}
3000
3001#ifdef CONFIG_RPS
3002
/* Read under RCU by get_rps_cpu(), which indexes ->ents with the masked
 * skb hash to find the CPU wanted for a flow.
 */
3003/* One global table that all flow-based protocols share. */
3004struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3005EXPORT_SYMBOL(rps_sock_flow_table);
3006
/* Static key tested by netif_rx_internal() to decide whether the RPS
 * CPU selection path is taken at all.
 */
3007struct static_key rps_needed __read_mostly;
3008
3009static struct rps_dev_flow *
3010set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3011            struct rps_dev_flow *rflow, u16 next_cpu)
3012{
3013        if (next_cpu != RPS_NO_CPU) {
3014#ifdef CONFIG_RFS_ACCEL
3015                struct netdev_rx_queue *rxqueue;
3016                struct rps_dev_flow_table *flow_table;
3017                struct rps_dev_flow *old_rflow;
3018                u32 flow_id;
3019                u16 rxq_index;
3020                int rc;
3021
3022                /* Should we steer this flow to a different hardware queue? */
3023                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3024                    !(dev->features & NETIF_F_NTUPLE))
3025                        goto out;
3026                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3027                if (rxq_index == skb_get_rx_queue(skb))
3028                        goto out;
3029
3030                rxqueue = dev->_rx + rxq_index;
3031                flow_table = rcu_dereference(rxqueue->rps_flow_table);
3032                if (!flow_table)
3033                        goto out;
3034                flow_id = skb_get_hash(skb) & flow_table->mask;
3035                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3036                                                        rxq_index, flow_id);
3037                if (rc < 0)
3038                        goto out;
3039                old_rflow = rflow;
3040                rflow = &flow_table->flows[flow_id];
3041                rflow->filter = rc;
3042                if (old_rflow->filter == rflow->filter)
3043                        old_rflow->filter = RPS_NO_FILTER;
3044        out:
3045#endif
3046                rflow->last_qtail =
3047                        per_cpu(softnet_data, next_cpu).input_queue_head;
3048        }
3049
3050        rflow->cpu = next_cpu;
3051        return rflow;
3052}
3053
3054/*
3055 * get_rps_cpu is called from netif_receive_skb and returns the target
3056 * CPU from the RPS map of the receiving queue for a given skb.
3057 * rcu_read_lock must be held on entry.
3058 */
3059static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3060                       struct rps_dev_flow **rflowp)
3061{
3062        struct netdev_rx_queue *rxqueue;
3063        struct rps_map *map;
3064        struct rps_dev_flow_table *flow_table;
3065        struct rps_sock_flow_table *sock_flow_table;
3066        int cpu = -1;
3067        u16 tcpu;
3068        u32 hash;
3069
3070        if (skb_rx_queue_recorded(skb)) {
3071                u16 index = skb_get_rx_queue(skb);
3072                if (unlikely(index >= dev->real_num_rx_queues)) {
3073                        WARN_ONCE(dev->real_num_rx_queues > 1,
3074                                  "%s received packet on queue %u, but number "
3075                                  "of RX queues is %u\n",
3076                                  dev->name, index, dev->real_num_rx_queues);
3077                        goto done;
3078                }
3079                rxqueue = dev->_rx + index;
3080        } else
3081                rxqueue = dev->_rx;
3082
3083        map = rcu_dereference(rxqueue->rps_map);
3084        if (map) {
3085                if (map->len == 1 &&
3086                    !rcu_access_pointer(rxqueue->rps_flow_table)) {
3087                        tcpu = map->cpus[0];
3088                        if (cpu_online(tcpu))
3089                                cpu = tcpu;
3090                        goto done;
3091                }
3092        } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3093                goto done;
3094        }
3095
3096        skb_reset_network_header(skb);
3097        hash = skb_get_hash(skb);
3098        if (!hash)
3099                goto done;
3100
3101        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3102        sock_flow_table = rcu_dereference(rps_sock_flow_table);
3103        if (flow_table && sock_flow_table) {
3104                u16 next_cpu;
3105                struct rps_dev_flow *rflow;
3106
3107                rflow = &flow_table->flows[hash & flow_table->mask];
3108                tcpu = rflow->cpu;
3109
3110                next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3111
3112                /*
3113                 * If the desired CPU (where last recvmsg was done) is
3114                 * different from current CPU (one in the rx-queue flow
3115                 * table entry), switch if one of the following holds:
3116                 *   - Current CPU is unset (equal to RPS_NO_CPU).
3117                 *   - Current CPU is offline.
3118                 *   - The current CPU's queue tail has advanced beyond the
3119                 *     last packet that was enqueued using this table entry.
3120                 *     This guarantees that all previous packets for the flow
3121                 *     have been dequeued, thus preserving in order delivery.
3122                 */
3123                if (unlikely(tcpu != next_cpu) &&
3124                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3125                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3126                      rflow->last_qtail)) >= 0)) {
3127                        tcpu = next_cpu;
3128                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3129                }
3130
3131                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3132                        *rflowp = rflow;
3133                        cpu = tcpu;
3134                        goto done;
3135                }
3136        }
3137
3138        if (map) {
3139                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3140                if (cpu_online(tcpu)) {
3141                        cpu = tcpu;
3142                        goto done;
3143                }
3144        }
3145
3146done:
3147        return cpu;
3148}
3149
3150#ifdef CONFIG_RFS_ACCEL
3151
3152/**
3153 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3154 * @dev: Device on which the filter was set
3155 * @rxq_index: RX queue index
3156 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3157 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3158 *
3159 * Drivers that implement ndo_rx_flow_steer() should periodically call
3160 * this function for each installed filter and remove the filters for
3161 * which it returns %true.
3162 */
3163bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3164                         u32 flow_id, u16 filter_id)
3165{
3166        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3167        struct rps_dev_flow_table *flow_table;
3168        struct rps_dev_flow *rflow;
3169        bool expire = true;
3170        int cpu;
3171
3172        rcu_read_lock();
3173        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3174        if (flow_table && flow_id <= flow_table->mask) {
3175                rflow = &flow_table->flows[flow_id];
3176                cpu = ACCESS_ONCE(rflow->cpu);
3177                if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3178                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3179                           rflow->last_qtail) <
3180                     (int)(10 * flow_table->mask)))
3181                        expire = false;
3182        }
3183        rcu_read_unlock();
3184        return expire;
3185}
3186EXPORT_SYMBOL(rps_may_expire_flow);
3187
3188#endif /* CONFIG_RFS_ACCEL */
3189
/*
 * @data is the struct softnet_data whose backlog NAPI is to be
 * scheduled: the backlog is put on that softnet_data's poll list via
 * ____napi_schedule() and its received_rps counter is incremented.
 */
3190/* Called from hardirq (IPI) context */
3191static void rps_trigger_softirq(void *data)
3192{
3193        struct softnet_data *sd = data;
3194
3195        ____napi_schedule(sd, &sd->backlog);
3196        sd->received_rps++;
3197}
3198
3199#endif /* CONFIG_RPS */
3200
/*
 * Check whether @sd belongs to another CPU.
 * If so, link it into this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = this_cpu_ptr(&softnet_data);

	if (sd == local_sd)
		return 0;

	/* Push sd on the front of the local IPI list. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3221
/*
 * Defaults to 1 << 12 (4096) and only exists with CONFIG_NET_FLOW_LIMIT.
 * NOTE(review): presumably the number of buckets used when a flow limit
 * table is allocated; the consumer is not in this part of the file, so
 * confirm before relying on this.
 */
3222#ifdef CONFIG_NET_FLOW_LIMIT
3223int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3224#endif
3225
3226static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3227{
3228#ifdef CONFIG_NET_FLOW_LIMIT
3229        struct sd_flow_limit *fl;
3230        struct softnet_data *sd;
3231        unsigned int old_flow, new_flow;
3232
3233        if (qlen < (netdev_max_backlog >> 1))
3234                return false;
3235
3236        sd = this_cpu_ptr(&softnet_data);
3237
3238        rcu_read_lock();
3239        fl = rcu_dereference(sd->flow_limit);
3240        if (fl) {
3241                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3242                old_flow = fl->history[fl->history_head];
3243                fl->history[fl->history_head] = new_flow;
3244
3245                fl->history_head++;
3246                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3247
3248                if (likely(fl->buckets[old_flow]))
3249                        fl->buckets[old_flow]--;
3250
3251                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3252                        fl->count++;
3253                        rcu_read_unlock();
3254                        return true;
3255                }
3256        }
3257        rcu_read_unlock();
3258#endif
3259        return false;
3260}
3261
3262/*
3263 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3264 * queue (may be a remote CPU queue).
3265 */
3266static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3267                              unsigned int *qtail)
3268{
3269        struct softnet_data *sd;
3270        unsigned long flags;
3271        unsigned int qlen;
3272
3273        sd = &per_cpu(softnet_data, cpu);
3274
3275        local_irq_save(flags);
3276
3277        rps_lock(sd);
3278        qlen = skb_queue_len(&sd->input_pkt_queue);
3279        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3280                if (qlen) {
3281enqueue:
3282                        __skb_queue_tail(&sd->input_pkt_queue, skb);
3283                        input_queue_tail_incr_save(sd, qtail);
3284                        rps_unlock(sd);
3285                        local_irq_restore(flags);
3286                        return NET_RX_SUCCESS;
3287                }
3288
3289                /* Schedule NAPI for backlog device
3290                 * We can use non atomic operation since we own the queue lock
3291                 */
3292                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3293                        if (!rps_ipi_queued(sd))
3294                                ____napi_schedule(sd, &sd->backlog);
3295                }
3296                goto enqueue;
3297        }
3298
3299        sd->dropped++;
3300        rps_unlock(sd);
3301
3302        local_irq_restore(flags);
3303
3304        atomic_long_inc(&skb->dev->rx_dropped);
3305        kfree_skb(skb);
3306        return NET_RX_DROP;
3307}
3308
3309static int netif_rx_internal(struct sk_buff *skb)
3310{
3311        int ret;
3312
3313        net_timestamp_check(netdev_tstamp_prequeue, skb);
3314
3315        trace_netif_rx(skb);
3316#ifdef CONFIG_RPS
3317        if (static_key_false(&rps_needed)) {
3318                struct rps_dev_flow voidflow, *rflow = &voidflow;
3319                int cpu;
3320
3321                preempt_disable();
3322                rcu_read_lock();
3323
3324                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3325                if (cpu < 0)
3326                        cpu = smp_processor_id();
3327
3328                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3329
3330                rcu_read_unlock();
3331                preempt_enable();
3332        } else
3333#endif
3334        {
3335                unsigned int qtail;
3336                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3337                put_cpu();
3338        }
3339        return ret;
3340}
3341
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It never returns an error
 *	code, but the buffer may still be dropped, either here for congestion
 *	control or later by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */
3356
3357int netif_rx(struct sk_buff *skb)
3358{
	/* Emit the entry tracepoint, then take the common receive path
	 * (netif_rx_internal()), which netif_rx_ni() uses as well.
	 */
3359        trace_netif_rx_entry(skb);
3360
3361        return netif_rx_internal(skb);
3362}
3363EXPORT_SYMBOL(netif_rx);
3364
/*
 * netif_rx_ni - queue @skb like netif_rx(), then run pending softirqs
 *
 * Calls netif_rx_internal() with preemption disabled and, if softirqs
 * are pending afterwards, runs do_softirq() before preemption is
 * enabled again.  Returns the result of netif_rx_internal().
 */
3365int netif_rx_ni(struct sk_buff *skb)
3366{
3367        int err;
3368
3369        trace_netif_rx_ni_entry(skb);
3370
3371        preempt_disable();
3372        err = netif_rx_internal(skb);
3373        if (local_softirq_pending())
3374                do_softirq();
3375        preempt_enable();
3376
3377        return err;
3378}
3379EXPORT_SYMBOL(netif_rx_ni);
3380
3381static void net_tx_action(struct softirq_action *h)
3382{
3383        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3384
3385        if (sd->completion_queue) {
3386                struct sk_buff *clist;
3387
3388                local_irq_disable();
3389                clist = sd->completion_queue;
3390                sd->completion_queue = NULL;
3391                local_irq_enable();
3392
3393                while (clist) {
3394                        struct sk_buff *skb = clist;
3395                        clist = clist->next;
3396
3397                        WARN_ON(atomic_read(&skb->users));
3398                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3399                                trace_consume_skb(skb);
3400                        else
3401                                trace_kfree_skb(skb, net_tx_action);
3402                        __kfree_skb(skb);
3403                }
3404        }
3405
3406        if (sd->output_queue) {
3407                struct Qdisc *head;
3408
3409                local_irq_disable();
3410                head = sd->output_queue;
3411                sd->output_queue = NULL;
3412                sd->output_queue_tailp = &sd->output_queue;
3413                local_irq_enable();
3414
3415                while (head) {
3416                        struct Qdisc *q = head;
3417                        spinlock_t *root_lock;
3418
3419                        head = head->next_sched;
3420
3421                        root_lock = qdisc_lock(q);
3422                        if (spin_trylock(root_lock)) {
3423                                smp_mb__before_atomic();
3424                                clear_bit(__QDISC_STATE_SCHED,
3425                                          &q->state);
3426                                qdisc_run(q);
3427                                spin_unlock(root_lock);
3428                        } else {
3429                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
3430                                              &q->state)) {
3431                                        __netif_reschedule(q);
3432                                } else {
3433                                        smp_mb__before_atomic();
3434                                        clear_bit(__QDISC_STATE_SCHED,
3435                                                  &q->state);
3436                                }
3437                        }
3438                }
3439        }
3440}
3441
3442#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3443    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * Function pointer with no initializer here, so it starts out NULL.
 * Only built when both bridge and ATM LANE support are configured
 * (built-in or modular).  Exported GPL-only.
 */
3444/* This hook is defined here for ATM LANE */
3445int (*br_fdb_test_addr_hook)(struct net_device *dev,
3446                             unsigned char *addr) __read_mostly;
3447EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3448#endif
3449
3450#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
 * instructions (a compare and two extra stores) right now if we don't
 * have it on but do have CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 */
3459static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3460{
3461        struct net_device *dev = skb->dev;
3462        u32 ttl = G_TC_RTTL(skb->tc_verd);
3463        int result = TC_ACT_OK;
3464        struct Qdisc *q;
3465
3466        if (unlikely(MAX_RED_LOOP < ttl++)) {
3467                net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3468                                     skb->skb_iif, dev->ifindex);
3469                return TC_ACT_SHOT;
3470        }
3471
3472        skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3473        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3474
3475        q = rcu_dereference(rxq->qdisc);
3476        if (q != &noop_qdisc) {
3477                spin_lock(qdisc_lock(q));
3478                if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3479                        result = qdisc_enqueue_root(skb, q);
3480                spin_unlock(qdisc_lock(q));
3481        }
3482
3483        return result;
3484}
3485
3486static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3487                                         struct packet_type **pt_prev,
3488                                         int *ret, struct net_device *orig_dev)
3489{
3490        struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3491
3492        if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3493                goto out;
3494
3495        if (*pt_prev) {
3496                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3497                *pt_prev = NULL;
3498        }
3499
3500        switch (ing_filter(skb, rxq)) {
3501        case TC_ACT_SHOT:
3502        case TC_ACT_STOLEN:
3503                kfree_skb(skb);
3504                return NULL;
3505        }
3506
3507out:
3508        skb->tc_verd = 0;
3509        return skb;
3510}
3511#endif
3512
3513/**
3514 *      netdev_rx_handler_register - register receive handler
3515 *      @dev: device to register a handler for
3516 *      @rx_handler: receive handler to register
3517 *      @rx_handler_data: data pointer that is used by rx handler
3518 *
3519 *      Register a receive handler for a device. This handler will then be
3520 *      called from __netif_receive_skb. A negative errno code is returned
3521 *      on a failure.
3522 *
3523 *      The caller must hold the rtnl_mutex.
3524 *
3525 *      For a general description of rx_handler, see enum rx_handler_result.
3526 */
3527int netdev_rx_handler_register(struct net_device *dev,
3528                               rx_handler_func_t *rx_handler,
3529                               void *rx_handler_data)
3530{
3531        ASSERT_RTNL();
3532
3533        if (dev->rx_handler)
3534                return -EBUSY;
3535
3536        /* Note: rx_handler_data must be set before rx_handler */
3537        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3538        rcu_assign_pointer(dev->rx_handler, rx_handler);
3539
3540        return 0;
3541}
3542EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3543
3544/**
3545 *      netdev_rx_handler_unregister - unregister receive handler
3546 *      @dev: device to unregister a handler from
3547 *
3548 *      Unregister a receive handler from a device.
3549 *
3550 *      The caller must hold the rtnl_mutex.
3551 */
3552void netdev_rx_handler_unregister(struct net_device *dev)
3553{
3554
3555        ASSERT_RTNL();
            /* Step 1: hide the handler itself.  The data pointer is left in
             * place until the synchronize_net() call below has returned.
             */
3556        RCU_INIT_POINTER(dev->rx_handler, NULL);
3557        /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3558         * section has a guarantee to see a non NULL rx_handler_data
3559         * as well.
3560         */
3561        synchronize_net();
            /* Step 2: only now clear the data pointer. */
3562        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3563}
3564EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3565
3566/*
3567 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3568 * the special handling of PFMEMALLOC skbs.
3569 */
3570static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3571{
3572        switch (skb->protocol) {
3573        case htons(ETH_P_ARP):
3574        case htons(ETH_P_IP):
3575        case htons(ETH_P_IPV6):
3576        case htons(ETH_P_8021Q):
3577        case htons(ETH_P_8021AD):
3578                return true;
3579        default:
3580                return false;
3581        }
3582}
3583
/*
 * Core of the receive path for a single skb.  Steps, in the order coded below:
 *   1. reset network/transport/mac_len bookkeeping and enter an RCU section;
 *   2. untag 802.1Q/802.1ad frames;
 *   3. offer the skb to the taps on ptype_all (skipped when @pfmemalloc);
 *   4. run ingress classification (CONFIG_NET_CLS_ACT only);
 *   5. drop pfmemalloc skbs whose protocol fails skb_pfmemalloc_protocol();
 *   6. VLAN receive processing and the device rx_handler, either of which may
 *      restart the walk at another_round;
 *   7. deliver to the matching handlers hashed in ptype_base.
 *
 * Delivery is deferred by one match through pt_prev: earlier matches go
 * through deliver_skb(), the final one is invoked directly via ->func.
 *
 * Returns NET_RX_DROP unless a delivery or a consuming rx_handler replaced it.
 */
3584static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3585{
3586        struct packet_type *ptype, *pt_prev;
3587        rx_handler_func_t *rx_handler;
3588        struct net_device *orig_dev;
3589        struct net_device *null_or_dev;
3590        bool deliver_exact = false;
3591        int ret = NET_RX_DROP;
3592        __be16 type;
3593
3594        net_timestamp_check(!netdev_tstamp_prequeue, skb);
3595
3596        trace_netif_receive_skb(skb);
3597
            /* Remember the device the skb arrived on; skb->dev may change on
             * later rounds but orig_dev is what handlers are told about.
             */
3598        orig_dev = skb->dev;
3599
3600        skb_reset_network_header(skb);
3601        if (!skb_transport_header_was_set(skb))
3602                skb_reset_transport_header(skb);
3603        skb_reset_mac_len(skb);
3604
3605        pt_prev = NULL;
3606
3607        rcu_read_lock();
3608
            /* Re-entry point after VLAN processing or an rx_handler asked
             * for another pass.
             */
3609another_round:
3610        skb->skb_iif = skb->dev->ifindex;
3611
3612        __this_cpu_inc(softnet_data.processed);
3613
3614        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3615            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3616                skb = skb_vlan_untag(skb);
3617                if (unlikely(!skb))
3618                        goto unlock;
3619        }
3620
3621#ifdef CONFIG_NET_CLS_ACT
            /* TC_NCLS set: clear it and jump past both the taps and the
             * ingress classification.
             */
3622        if (skb->tc_verd & TC_NCLS) {
3623                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3624                goto ncls;
3625        }
3626#endif
3627
3628        if (pfmemalloc)
3629                goto skip_taps;
3630
            /* Taps: entries bound to no device or to the current skb->dev. */
3631        list_for_each_entry_rcu(ptype, &ptype_all, list) {
3632                if (!ptype->dev || ptype->dev == skb->dev) {
3633                        if (pt_prev)
3634                                ret = deliver_skb(skb, pt_prev, orig_dev);
3635                        pt_prev = ptype;
3636                }
3637        }
3638
3639skip_taps:
3640#ifdef CONFIG_NET_CLS_ACT
            /* handle_ing() returns NULL when the skb was freed there. */
3641        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3642        if (!skb)
3643                goto unlock;
3644ncls:
3645#endif
3646
3647        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3648                goto drop;
3649
3650        if (vlan_tx_tag_present(skb)) {
                    /* Flush the pending handler before VLAN processing. */
3651                if (pt_prev) {
3652                        ret = deliver_skb(skb, pt_prev, orig_dev);
3653                        pt_prev = NULL;
3654                }
3655                if (vlan_do_receive(&skb))
3656                        goto another_round;
3657                else if (unlikely(!skb))
3658                        goto unlock;
3659        }
3660
3661        rx_handler = rcu_dereference(skb->dev->rx_handler);
3662        if (rx_handler) {
                    /* Flush the pending handler before calling rx_handler. */
3663                if (pt_prev) {
3664                        ret = deliver_skb(skb, pt_prev, orig_dev);
3665                        pt_prev = NULL;
3666                }
3667                switch (rx_handler(&skb)) {
3668                case RX_HANDLER_CONSUMED:
3669                        ret = NET_RX_SUCCESS;
3670                        goto unlock;
3671                case RX_HANDLER_ANOTHER:
3672                        goto another_round;
3673                case RX_HANDLER_EXACT:
3674                        deliver_exact = true;
                            /* fall through */
3675                case RX_HANDLER_PASS:
3676                        break;
3677                default:
3678                        BUG();
3679                }
3680        }
3681
            /* A tag is still present at this point: a non-zero VLAN id marks
             * the packet PACKET_OTHERHOST, and the tag is cleared either way.
             */
3682        if (unlikely(vlan_tx_tag_present(skb))) {
3683                if (vlan_tx_tag_get_id(skb))
3684                        skb->pkt_type = PACKET_OTHERHOST;
3685                /* Note: we might in the future use prio bits
3686                 * and set skb->priority like in vlan_do_receive()
3687                 * For the time being, just ignore Priority Code Point
3688                 */
3689                skb->vlan_tci = 0;
3690        }
3691
3692        /* deliver only exact match when indicated */
3693        null_or_dev = deliver_exact ? skb->dev : NULL;
3694
            /* Protocol handlers: the type must match, and the handler's device
             * must be null_or_dev, the current skb->dev or orig_dev.
             */
3695        type = skb->protocol;
3696        list_for_each_entry_rcu(ptype,
3697                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3698                if (ptype->type == type &&
3699                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3700                     ptype->dev == orig_dev)) {
3701                        if (pt_prev)
3702                                ret = deliver_skb(skb, pt_prev, orig_dev);
3703                        pt_prev = ptype;
3704                }
3705        }
3706
            /* The last matching handler gets the skb through a direct ->func
             * call; with no handler at all the skb is counted as dropped and
             * freed.
             */
3707        if (pt_prev) {
3708                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3709                        goto drop;
3710                else
3711                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3712        } else {
3713drop:
3714                atomic_long_inc(&skb->dev->rx_dropped);
3715                kfree_skb(skb);
3716                /* Jamal, now you will not able to escape explaining
3717                 * me how you were going to use this. :-)
3718                 */
3719                ret = NET_RX_DROP;
3720        }
3721
3722unlock:
3723        rcu_read_unlock();
3724        return ret;
3725}
3726
3727static int __netif_receive_skb(struct sk_buff *skb)
3728{
3729        int ret;
3730
3731        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3732                unsigned long pflags = current->flags;
3733
3734                /*
3735                 * PFMEMALLOC skbs are special, they should
3736                 * - be delivered to SOCK_MEMALLOC sockets only
3737                 * - stay away from userspace
3738                 * - have bounded memory usage
3739                 *
3740                 * Use PF_MEMALLOC as this saves us from propagating the allocation
3741                 * context down to all allocation sites.
3742                 */
3743                current->flags |= PF_MEMALLOC;
3744                ret = __netif_receive_skb_core(skb, true);
3745                tsk_restore_flags(current, pflags, PF_MEMALLOC);
3746        } else
3747                ret = __netif_receive_skb_core(skb, false);
3748
3749        return ret;
3750}
3751
3752static int netif_receive_skb_internal(struct sk_buff *skb)
3753{
3754        net_timestamp_check(netdev_tstamp_prequeue, skb);
3755
3756        if (skb_defer_rx_timestamp(skb))
3757                return NET_RX_SUCCESS;
3758
3759#ifdef CONFIG_RPS
3760        if (static_key_false(&rps_needed)) {
3761                struct rps_dev_flow voidflow, *rflow = &voidflow;
3762                int cpu, ret;
3763
3764                rcu_read_lock();
3765
3766                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3767
3768                if (cpu >= 0) {
3769                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3770                        rcu_read_unlock();
3771                        return ret;
3772                }
3773                rcu_read_unlock();
3774        }
3775#endif
3776        return __netif_receive_skb(skb);
3777}
3778
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for receive data processing. The call itself always
 *	succeeds; the buffer may still be dropped while it is processed, for
 *	congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	/* Emit the entry tracepoint, then defer to the internal path. */
	trace_netif_receive_skb_entry(skb);

	return netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb);
3801
3802/* Network device is going away, flush any packets still pending
3803 * Called with irqs disabled.
3804 */
3805static void flush_backlog(void *arg)
3806{
3807        struct net_device *dev = arg;
3808        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3809        struct sk_buff *skb, *tmp;
3810
3811        rps_lock(sd);
3812        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3813                if (skb->dev == dev) {
3814                        __skb_unlink(skb, &sd->input_pkt_queue);
3815                        kfree_skb(skb);
3816                        input_queue_head_incr(sd);
3817                }
3818        }
3819        rps_unlock(sd);
3820
3821        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3822                if (skb->dev == dev) {
3823                        __skb_unlink(skb, &sd->process_queue);
3824                        kfree_skb(skb);
3825                        input_queue_head_incr(sd);
3826                }
3827        }
3828}
3829
3830static int napi_gro_complete(struct sk_buff *skb)
3831{
3832        struct packet_offload *ptype;
3833        __be16 type = skb->protocol;
3834        struct list_head *head = &offload_base;
3835        int err = -ENOENT;
3836
3837        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3838
3839        if (NAPI_GRO_CB(skb)->count == 1) {
3840                skb_shinfo(skb)->gso_size = 0;
3841                goto out;
3842        }
3843
3844        rcu_read_lock();
3845        list_for_each_entry_rcu(ptype, head, list) {
3846                if (ptype->type != type || !ptype->callbacks.gro_complete)
3847                        continue;
3848
3849                err = ptype->callbacks.gro_complete(skb, 0);
3850                break;
3851        }
3852        rcu_read_unlock();
3853
3854        if (err) {
3855                WARN_ON(&ptype->list == head);
3856                kfree_skb(skb);
3857                return NET_RX_SUCCESS;
3858        }
3859
3860out:
3861        return netif_receive_skb_internal(skb);
3862}
3863
3864/* napi->gro_list contains packets ordered by age.
3865 * youngest packets at the head of it.
3866 * Complete skbs in reverse order to reduce latencies.
3867 */
3868void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3869{
3870        struct sk_buff *skb, *prev = NULL;
3871
3872        /* scan list and build reverse chain */
3873        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3874                skb->prev = prev;
3875                prev = skb;
3876        }
3877
3878        for (skb = prev; skb; skb = prev) {
3879                skb->next = NULL;
3880
3881                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3882                        return;
3883
3884                prev = skb->prev;
3885                napi_gro_complete(skb);
3886                napi->gro_count--;
3887        }
3888
3889        napi->gro_list = NULL;
3890}
3891EXPORT_SYMBOL(napi_gro_flush);
3892
3893static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3894{
3895        struct sk_buff *p;
3896        unsigned int maclen = skb->dev->hard_header_len;
3897        u32 hash = skb_get_hash_raw(skb);
3898
3899        for (p = napi->gro_list; p; p = p->next) {
3900                unsigned long diffs;
3901
3902                NAPI_GRO_CB(p)->flush = 0;
3903
3904                if (hash != skb_get_hash_raw(p)) {
3905                        NAPI_GRO_CB(p)->same_flow = 0;
3906                        continue;
3907                }
3908
3909                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3910                diffs |= p->vlan_tci ^ skb->vlan_tci;
3911                if (maclen == ETH_HLEN)
3912                        diffs |= compare_ether_header(skb_mac_header(p),
3913                                                      skb_mac_header(skb));
3914                else if (!diffs)
3915                        diffs = memcmp(skb_mac_header(p),
3916                                       skb_mac_header(skb),
3917                                       maclen);
3918                NAPI_GRO_CB(p)->same_flow = !diffs;
3919        }
3920}
3921
3922static void skb_gro_reset_offset(struct sk_buff *skb)
3923{
3924        const struct skb_shared_info *pinfo = skb_shinfo(skb);
3925        const skb_frag_t *frag0 = &pinfo->frags[0];
3926
3927        NAPI_GRO_CB(skb)->data_offset = 0;
3928        NAPI_GRO_CB(skb)->frag0 = NULL;
3929        NAPI_GRO_CB(skb)->frag0_len = 0;
3930
3931        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3932            pinfo->nr_frags &&
3933            !PageHighMem(skb_frag_page(frag0))) {
3934                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3935                NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3936        }
3937}
3938
3939static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3940{
3941        struct skb_shared_info *pinfo = skb_shinfo(skb);
3942
3943        BUG_ON(skb->end - skb->tail < grow);
3944
3945        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3946
3947        skb->data_len -= grow;
3948        skb->tail += grow;
3949
3950        pinfo->frags[0].page_offset += grow;
3951        skb_frag_size_sub(&pinfo->frags[0], grow);
3952
3953        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3954                skb_frag_unref(skb, 0);
3955                memmove(pinfo->frags, pinfo->frags + 1,
3956                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3957        }
3958}
3959
/*
 * Try to aggregate @skb with the packets held on napi->gro_list.
 *
 * Returns:
 *   GRO_NORMAL       - GRO not applicable or refused; caller processes skb;
 *   GRO_MERGED       - a held packet was marked same_flow as skb;
 *   GRO_MERGED_FREE  - as above, and NAPI_GRO_CB(skb)->free was set;
 *   GRO_HELD         - skb was queued at the head of napi->gro_list.
 */
3960static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3961{
3962        struct sk_buff **pp = NULL;
3963        struct packet_offload *ptype;
3964        __be16 type = skb->protocol;
3965        struct list_head *head = &offload_base;
3966        int same_flow;
3967        enum gro_result ret;
3968        int grow;
3969
            /* GRO is bypassed when the device feature is off ... */
3970        if (!(skb->dev->features & NETIF_F_GRO))
3971                goto normal;
3972
            /* ... or for GSO skbs, skbs with a frag list or a bad checksum. */
3973        if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
3974                goto normal;
3975
            /* Pre-compute same_flow on every held packet relative to skb. */
3976        gro_list_prepare(napi, skb);
3977
3978        rcu_read_lock();
            /* Only the first offload handler that matches the protocol and
             * provides gro_receive is used; the loop breaks right after it.
             */
3979        list_for_each_entry_rcu(ptype, head, list) {
3980                if (ptype->type != type || !ptype->callbacks.gro_receive)
3981                        continue;
3982
3983                skb_set_network_header(skb, skb_gro_offset(skb));
3984                skb_reset_mac_len(skb);
3985                NAPI_GRO_CB(skb)->same_flow = 0;
3986                NAPI_GRO_CB(skb)->flush = 0;
3987                NAPI_GRO_CB(skb)->free = 0;
3988                NAPI_GRO_CB(skb)->udp_mark = 0;
3989
3990                /* Setup for GRO checksum validation */
3991                switch (skb->ip_summed) {
3992                case CHECKSUM_COMPLETE:
3993                        NAPI_GRO_CB(skb)->csum = skb->csum;
3994                        NAPI_GRO_CB(skb)->csum_valid = 1;
3995                        NAPI_GRO_CB(skb)->csum_cnt = 0;
3996                        break;
3997                case CHECKSUM_UNNECESSARY:
3998                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
3999                        NAPI_GRO_CB(skb)->csum_valid = 0;
4000                        break;
4001                default:
4002                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4003                        NAPI_GRO_CB(skb)->csum_valid = 0;
4004                }
4005
4006                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4007                break;
4008        }
4009        rcu_read_unlock();
4010
            /* The walk reached the list head again: no handler matched. */
4011        if (&ptype->list == head)
4012                goto normal;
4013
4014        same_flow = NAPI_GRO_CB(skb)->same_flow;
4015        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4016
            /* The handler returned a slot in the held list: unlink that
             * packet and complete it now.
             */
4017        if (pp) {
4018                struct sk_buff *nskb = *pp;
4019
4020                *pp = nskb->next;
4021                nskb->next = NULL;
4022                napi_gro_complete(nskb);
4023                napi->gro_count--;
4024        }
4025
            /* same_flow set: return ret as computed above, skipping the
             * frag0 pull at the pull label.
             */
4026        if (same_flow)
4027                goto ok;
4028
4029        if (NAPI_GRO_CB(skb)->flush)
4030                goto normal;
4031
            /* The held list is full: complete the packet at the tail (the
             * oldest one) to make room; gro_count stays unchanged.
             */
4032        if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4033                struct sk_buff *nskb = napi->gro_list;
4034
4035                /* locate the end of the list to select the 'oldest' flow */
4036                while (nskb->next) {
4037                        pp = &nskb->next;
4038                        nskb = *pp;
4039                }
4040                *pp = NULL;
4041                nskb->next = NULL;
4042                napi_gro_complete(nskb);
4043        } else {
4044                napi->gro_count++;
4045        }
            /* Hold skb as the start of a new flow at the head of the list. */
4046        NAPI_GRO_CB(skb)->count = 1;
4047        NAPI_GRO_CB(skb)->age = jiffies;
4048        NAPI_GRO_CB(skb)->last = skb;
4049        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4050        skb->next = napi->gro_list;
4051        napi->gro_list = skb;
4052        ret = GRO_HELD;
4053
            /* Reached for GRO_HELD and GRO_NORMAL: when the GRO offset lies
             * beyond the linear head length, copy the missing bytes in from
             * frag0.
             */
4054pull:
4055        grow = skb_gro_offset(skb) - skb_headlen(skb);
4056        if (grow > 0)
4057                gro_pull_from_frag0(skb, grow);
4058ok:
4059        return ret;
4060
4061normal:
4062        ret = GRO_NORMAL;
4063        goto pull;
4064}
4065
4066struct packet_offload *gro_find_receive_by_type(__be16 type)
4067{
4068        struct list_head *offload_head = &offload_base;
4069        struct packet_offload *ptype;
4070
4071        list_for_each_entry_rcu(ptype, offload_head, list) {
4072                if (ptype->type != type || !ptype->callbacks.gro_receive)
4073                        continue;
4074                return ptype;
4075        }
4076        return NULL;
4077}
4078EXPORT_SYMBOL(gro_find_receive_by_type);
4079
4080struct packet_offload *gro_find_complete_by_type(__be16 type)
4081{
4082        struct list_head *offload_head = &offload_base;
4083        struct packet_offload *ptype;
4084
4085        list_for_each_entry_rcu(ptype, offload_head, list) {
4086                if (ptype->type != type || !ptype->callbacks.gro_complete)
4087                        continue;
4088                return ptype;
4089        }
4090        return NULL;
4091}
4092EXPORT_SYMBOL(gro_find_complete_by_type);
4093
4094static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4095{
4096        switch (ret) {
4097        case GRO_NORMAL:
4098                if (netif_receive_skb_internal(skb))
4099                        ret = GRO_DROP;
4100                break;
4101
4102        case GRO_DROP:
4103                kfree_skb(skb);
4104                break;
4105
4106        case GRO_MERGED_FREE:
4107                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4108                        kmem_cache_free(skbuff_head_cache, skb);
4109                else
4110                        __kfree_skb(skb);
4111                break;
4112
4113        case GRO_HELD:
4114        case GRO_MERGED:
4115                break;
4116        }
4117
4118        return ret;
4119}
4120
4121gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4122{
4123        trace_napi_gro_receive_entry(skb);
4124
4125        skb_gro_reset_offset(skb);
4126
4127        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4128}
4129EXPORT_SYMBOL(napi_gro_receive);
4130
4131static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4132{
4133        if (unlikely(skb->pfmemalloc)) {
4134                consume_skb(skb);
4135                return;
4136        }
4137        __skb_pull(skb, skb_headlen(skb));
4138        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4139        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4140        skb->vlan_tci = 0;
4141        skb->dev = napi->dev;
4142        skb->skb_iif = 0;
4143        skb->encapsulation = 0;
4144        skb_shinfo(skb)->gso_type = 0;
4145        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4146
4147        napi->skb = skb;
4148}
4149
4150struct sk_buff *napi_get_frags(struct napi_struct *napi)
4151{
4152        struct sk_buff *skb = napi->skb;
4153
4154        if (!skb) {
4155                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4156                napi->skb = skb;
4157        }
4158        return skb;
4159}
4160EXPORT_SYMBOL(napi_get_frags);
4161
4162static gro_result_t napi_frags_finish(struct napi_struct *napi,
4163                                      struct sk_buff *skb,
4164                                      gro_result_t ret)
4165{
4166        switch (ret) {
4167        case GRO_NORMAL:
4168        case GRO_HELD:
4169                __skb_push(skb, ETH_HLEN);
4170                skb->protocol = eth_type_trans(skb, skb->dev);
4171                if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4172                        ret = GRO_DROP;
4173                break;
4174
4175        case GRO_DROP:
4176        case GRO_MERGED_FREE:
4177                napi_reuse_skb(napi, skb);
4178                break;
4179
4180        case GRO_MERGED:
4181                break;
4182        }
4183
4184        return ret;
4185}
4186
4187/* Upper GRO stack assumes network header starts at gro_offset=0
4188 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4189 * We copy ethernet header into skb->data to have a common layout.
4190 */
4191static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4192{
4193        struct sk_buff *skb = napi->skb;
4194        const struct ethhdr *eth;
4195        unsigned int hlen = sizeof(*eth);
4196
            /* Take ownership of the cached skb; napi->skb is only set again
             * on the failure path below, through napi_reuse_skb().
             */
4197        napi->skb = NULL;
4198
4199        skb_reset_mac_header(skb);
4200        skb_gro_reset_offset(skb);
4201
            /* NOTE(review): skb_gro_header_hard() presumably reports that
             * hlen bytes are not reachable through frag0 - confirm against
             * its definition.  In that case the slow helper is used, and a
             * NULL result recycles the skb and makes this function return
             * NULL.
             */
4202        eth = skb_gro_header_fast(skb, 0);
4203        if (unlikely(skb_gro_header_hard(skb, hlen))) {
4204                eth = skb_gro_header_slow(skb, hlen, 0);
4205                if (unlikely(!eth)) {
4206                        napi_reuse_skb(napi, skb);
4207                        return NULL;
4208                }
4209        } else {
                    /* Copy the Ethernet header from frag0 into the linear
                     * area and advance the frag0 bookkeeping past it.
                     */
4210                gro_pull_from_frag0(skb, hlen);
4211                NAPI_GRO_CB(skb)->frag0 += hlen;
4212                NAPI_GRO_CB(skb)->frag0_len -= hlen;
4213        }
            /* Step over the Ethernet header in the linear data. */
4214        __skb_pull(skb, hlen);
4215
4216        /*
4217         * This works because the only protocols we care about don't require
4218         * special handling.
4219         * We'll fix it up properly in napi_frags_finish()
4220         */
4221        skb->protocol = eth->h_proto;
4222
4223        return skb;
4224}
4225
4226gro_result_t napi_gro_frags(struct napi_struct *napi)
4227{
4228        struct sk_buff *skb = napi_frags_skb(napi);
4229
4230        if (!skb)
4231                return GRO_DROP;
4232
4233        trace_napi_gro_frags_entry(skb);
4234
4235        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4236}
4237EXPORT_SYMBOL(napi_gro_frags);
4238
4239/* Compute the checksum from gro_offset and return the folded value
4240 * after adding in any pseudo checksum.
4241 */
4242__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4243{
4244        __wsum wsum;
4245        __sum16 sum;
4246
4247        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4248
4249        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4250        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4251        if (likely(!sum)) {
4252                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4253                    !skb->csum_complete_sw)
4254                        netdev_rx_csum_fault(skb->dev);
4255        }
4256
4257        NAPI_GRO_CB(skb)->csum = wsum;
4258        NAPI_GRO_CB(skb)->csum_valid = 1;
4259
4260        return sum;
4261}
4262EXPORT_SYMBOL(__skb_gro_checksum_complete);
4263
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled
 * on every path.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		/* Detach the whole list while irqs are still disabled. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		while (remsd) {
			struct softnet_data *cur = remsd;

			/* Fetch the successor before the IPI is sent. */
			remsd = cur->rps_ipi_next;

			if (cpu_online(cur->cpu))
				smp_call_function_single_async(cur->cpu,
							       &cur->csd);
		}
		return;
	}
#endif
	local_irq_enable();
}
4291
4292static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4293{
4294#ifdef CONFIG_RPS
4295        return sd->rps_ipi_list != NULL;
4296#else
4297        return false;
4298#endif
4299}
4300
/*
 * Poll routine of the backlog napi embedded in struct softnet_data (the
 * softnet_data is recovered from @napi with container_of()).
 *
 * Packets are taken from sd->process_queue and handed to
 * __netif_receive_skb() with local irqs enabled; whenever that queue runs
 * empty it is refilled from sd->input_pkt_queue under rps_lock().
 *
 * Returns the number of packets processed.  The function returns early, with
 * napi->state untouched, once that number reaches @quota.
 */
4301static int process_backlog(struct napi_struct *napi, int quota)
4302{
4303        int work = 0;
4304        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4305
4306        /* Check if we have pending ipi, its better to send them now,
4307         * not waiting net_rx_action() end.
4308         */
4309        if (sd_has_rps_ipi_waiting(sd)) {
4310                local_irq_disable();
4311                net_rps_action_and_irq_enable(sd);
4312        }
4313
4314        napi->weight = weight_p;
            /* From here on local irqs are disabled, except around each
             * __netif_receive_skb() call.
             */
4315        local_irq_disable();
4316        while (1) {
4317                struct sk_buff *skb;
4318
4319                while ((skb = __skb_dequeue(&sd->process_queue))) {
4320                        local_irq_enable();
4321                        __netif_receive_skb(skb);
4322                        local_irq_disable();
4323                        input_queue_head_incr(sd);
                            /* Quota exhausted: leave with irqs enabled. */
4324                        if (++work >= quota) {
4325                                local_irq_enable();
4326                                return work;
4327                        }
4328                }
4329
4330                rps_lock(sd);
4331                if (skb_queue_empty(&sd->input_pkt_queue)) {
4332                        /*
4333                         * Inline a custom version of __napi_complete().
4334                         * only current cpu owns and manipulates this napi,
4335                         * and NAPI_STATE_SCHED is the only possible flag set
4336                         * on backlog.
4337                         * We can use a plain write instead of clear_bit(),
4338                         * and we dont need an smp_mb() memory barrier.
4339                         */
4340                        napi->state = 0;
4341                        rps_unlock(sd);
4342
4343                        break;
4344                }
4345
                    /* Move everything that was queued meanwhile over to the
                     * process queue and go around again.
                     */
4346                skb_queue_splice_tail_init(&sd->input_pkt_queue,
4347                                           &sd->process_queue);
4348                rps_unlock(sd);
4349        }
4350        local_irq_enable();
4351
4352        return work;
4353}
4354
4355/**
4356 * __napi_schedule - schedule for receive
4357 * @n: entry to schedule
4358 *
4359 * The entry's receive function will be scheduled to run.
4360 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4361 */
4362void __napi_schedule(struct napi_struct *n)
4363{
4364        unsigned long flags;
4365
4366        local_irq_save(flags);
4367        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4368        local_irq_restore(flags);
4369}
4370EXPORT_SYMBOL(__napi_schedule);
4371
4372/**
4373 * __napi_schedule_irqoff - schedule for receive
4374 * @n: entry to schedule
4375 *
4376 * Variant of __napi_schedule() assuming hard irqs are masked
4377 */
4378void __napi_schedule_irqoff(struct napi_struct *n)
4379{
4380        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4381}
4382EXPORT_SYMBOL(__napi_schedule_irqoff);
4383
/*
 * Mark @n as no longer scheduled: remove it from the poll list it is on and
 * clear NAPI_STATE_SCHED.  BUGs if NAPI_STATE_SCHED is not set on entry.
 * NOTE(review): the in-file caller napi_complete_done() wraps this call in
 * local_irq_save()/local_irq_restore(); other callers presumably need irqs
 * masked as well - confirm.
 */
4384void __napi_complete(struct napi_struct *n)
4385{
4386        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4387
4388        list_del_init(&n->poll_list);
            /* Barrier ahead of the atomic bit clear, so that the list removal
             * above is ordered before NAPI_STATE_SCHED is seen cleared.
             */
4389        smp_mb__before_atomic();
4390        clear_bit(NAPI_STATE_SCHED, &n->state);
4391}
4392EXPORT_SYMBOL(__napi_complete);
4393
4394void napi_complete_done(struct napi_struct *n, int work_done)
4395{
4396        unsigned long flags;
4397
4398        /*
4399         * don't let napi dequeue from the cpu poll list
4400         * just in case its running on a different cpu
4401         */
4402        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4403                return;
4404
4405        if (n->gro_list) {
4406                unsigned long timeout = 0;
4407
4408                if (work_done)
4409                        timeout = n->dev->gro_flush_timeout;
4410
4411                if (timeout)
4412                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
4413                                      HRTIMER_MODE_REL_PINNED);
4414                else
4415                        napi_gro_flush(n, false);
4416        }
4417        if (likely(list_empty(&n->poll_list))) {
4418                WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4419        } else {
4420                /* If n->poll_list is not empty, we need to mask irqs */
4421                local_irq_save(flags);
4422                __napi_complete(n);
4423                local_irq_restore(flags);
4424        }
4425}
4426EXPORT_SYMBOL(napi_complete_done);
4427
4428/* must be called under rcu_read_lock(), as we dont take a reference */
4429struct napi_struct *napi_by_id(unsigned int napi_id)
4430{
4431        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4432        struct napi_struct *napi;
4433
4434        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4435                if (napi->napi_id == napi_id)
4436                        return napi;
4437
4438        return NULL;
4439}
4440EXPORT_SYMBOL_GPL(napi_by_id);
4441
4442void napi_hash_add(struct napi_struct *napi)
4443{
4444        if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4445
4446                spin_lock(&napi_hash_lock);
4447
4448                /* 0 is not a valid id, we also skip an id that is taken
4449                 * we expect both events to be extremely rare
4450                 */
4451                napi->napi_id = 0;
4452                while (!napi->napi_id) {
4453                        napi->napi_id = ++napi_gen_id;
4454                        if (napi_by_id(napi->napi_id))
4455                                napi->napi_id = 0;
4456                }
4457
4458                hlist_add_head_rcu(&napi->napi_hash_node,
4459                        &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4460
4461                spin_unlock(&napi_hash_lock);
4462        }
4463}
4464EXPORT_SYMBOL_GPL(napi_hash_add);
4465
4466/* Warning : caller is responsible to make sure rcu grace period
4467 * is respected before freeing memory containing @napi
4468 */
4469void napi_hash_del(struct napi_struct *napi)
4470{
4471        spin_lock(&napi_hash_lock);
4472
4473        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4474                hlist_del_rcu(&napi->napi_hash_node);
4475
4476        spin_unlock(&napi_hash_lock);
4477}
4478EXPORT_SYMBOL_GPL(napi_hash_del);
4479
4480static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4481{
4482        struct napi_struct *napi;
4483
4484        napi = container_of(timer, struct napi_struct, timer);
4485        if (napi->gro_list)
4486                napi_schedule(napi);
4487
4488        return HRTIMER_NORESTART;
4489}
4490
4491void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4492                    int (*poll)(struct napi_struct *, int), int weight)
4493{
4494        INIT_LIST_HEAD(&napi->poll_list);
4495        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4496        napi->timer.function = napi_watchdog;
4497        napi->gro_count = 0;
4498        napi->gro_list = NULL;
4499        napi->skb = NULL;
4500        napi->poll = poll;
4501        if (weight > NAPI_POLL_WEIGHT)
4502                pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4503                            weight, dev->name);
4504        napi->weight = weight;
4505        list_add(&napi->dev_list, &dev->napi_list);
4506        napi->dev = dev;
4507#ifdef CONFIG_NETPOLL
4508        spin_lock_init(&napi->poll_lock);
4509        napi->poll_owner = -1;
4510#endif
4511        set_bit(NAPI_STATE_SCHED, &napi->state);
4512}
4513EXPORT_SYMBOL(netif_napi_add);
4514
4515void napi_disable(struct napi_struct *n)
4516{
4517        might_sleep();
4518        set_bit(NAPI_STATE_DISABLE, &n->state);
4519
4520        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4521                msleep(1);
4522
4523        hrtimer_cancel(&n->timer);
4524
4525        clear_bit(NAPI_STATE_DISABLE, &n->state);
4526}
4527EXPORT_SYMBOL(napi_disable);
4528
4529void netif_napi_del(struct napi_struct *napi)
4530{
4531        list_del_init(&napi->dev_list);
4532        napi_free_frags(napi);
4533
4534        kfree_skb_list(napi->gro_list);
4535        napi->gro_list = NULL;
4536        napi->gro_count = 0;
4537}
4538EXPORT_SYMBOL(netif_napi_del);
4539
4540static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4541{
4542        void *have;
4543        int work, weight;
4544
4545        list_del_init(&n->poll_list);
4546
4547        have = netpoll_poll_lock(n);
4548
4549        weight = n->weight;
4550
4551        /* This NAPI_STATE_SCHED test is for avoiding a race
4552         * with netpoll's poll_napi().  Only the entity which
4553         * obtains the lock and sees NAPI_STATE_SCHED set will
4554         * actually make the ->poll() call.  Therefore we avoid
4555         * accidentally calling ->poll() when NAPI is not scheduled.
4556         */
4557        work = 0;
4558        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4559                work = n->poll(n, weight);
4560                trace_napi_poll(n);
4561        }
4562
4563        WARN_ON_ONCE(work > weight);
4564
4565        if (likely(work < weight))
4566                goto out_unlock;
4567
4568        /* Drivers must not modify the NAPI state if they
4569         * consume the entire weight.  In such cases this code
4570         * still "owns" the NAPI instance and therefore can
4571         * move the instance around on the list at-will.
4572         */
4573        if (unlikely(napi_disable_pending(n))) {
4574                napi_complete(n);
4575                goto out_unlock;
4576        }
4577
4578        if (n->gro_list) {
4579                /* flush too old packets
4580                 * If HZ < 1000, flush all packets.
4581                 */
4582                napi_gro_flush(n, HZ >= 1000);
4583        }
4584
4585        /* Some drivers may have called napi_schedule
4586         * prior to exhausting their budget.
4587         */
4588        if (unlikely(!list_empty(&n->poll_list))) {
4589                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4590                             n->dev ? n->dev->name : "backlog");
4591                goto out_unlock;
4592        }
4593
4594        list_add_tail(&n->poll_list, repoll);
4595
4596out_unlock:
4597        netpoll_poll_unlock(have);
4598
4599        return work;
4600}
4601
4602static void net_rx_action(struct softirq_action *h)
4603{
4604        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4605        unsigned long time_limit = jiffies + 2;
4606        int budget = netdev_budget;
4607        LIST_HEAD(list);
4608        LIST_HEAD(repoll);
4609
4610        local_irq_disable();
4611        list_splice_init(&sd->poll_list, &list);
4612        local_irq_enable();
4613
4614        for (;;) {
4615                struct napi_struct *n;
4616
4617                if (list_empty(&list)) {
4618                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4619                                return;
4620                        break;
4621                }
4622
4623                n = list_first_entry(&list, struct napi_struct, poll_list);
4624                budget -= napi_poll(n, &repoll);
4625
4626                /* If softirq window is exhausted then punt.
4627                 * Allow this to run for 2 jiffies since which will allow
4628                 * an average latency of 1.5/HZ.
4629                 */
4630                if (unlikely(budget <= 0 ||
4631                             time_after_eq(jiffies, time_limit))) {
4632                        sd->time_squeeze++;
4633                        break;
4634                }
4635        }
4636
4637        local_irq_disable();
4638
4639        list_splice_tail_init(&sd->poll_list, &list);
4640        list_splice_tail(&repoll, &list);
4641        list_splice(&list, &sd->poll_list);
4642        if (!list_empty(&sd->poll_list))
4643                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4644
4645        net_rps_action_and_irq_enable(sd);
4646}
4647
4648struct netdev_adjacent {
4649        struct net_device *dev;
4650
4651        /* upper master flag, there can only be one master device per list */
4652        bool master;
4653
4654        /* counter for the number of times this device was added to us */
4655        u16 ref_nr;
4656
4657        /* private field for the users */
4658        void *private;
4659
4660        struct list_head list;
4661        struct rcu_head rcu;
4662};
4663
4664static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4665                                                 struct net_device *adj_dev,
4666                                                 struct list_head *adj_list)
4667{
4668        struct netdev_adjacent *adj;
4669
4670        list_for_each_entry(adj, adj_list, list) {
4671                if (adj->dev == adj_dev)
4672                        return adj;
4673        }
4674        return NULL;
4675}
4676
4677/**
4678 * netdev_has_upper_dev - Check if device is linked to an upper device
4679 * @dev: device
4680 * @upper_dev: upper device to check
4681 *
4682 * Find out if a device is linked to specified upper device and return true
4683 * in case it is. Note that this checks only immediate upper device,
4684 * not through a complete stack of devices. The caller must hold the RTNL lock.
4685 */
4686bool netdev_has_upper_dev(struct net_device *dev,
4687                          struct net_device *upper_dev)
4688{
4689        ASSERT_RTNL();
4690
4691        return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4692}
4693EXPORT_SYMBOL(netdev_has_upper_dev);
4694
4695/**
4696 * netdev_has_any_upper_dev - Check if device is linked to some device
4697 * @dev: device
4698 *
4699 * Find out if a device is linked to an upper device and return true in case
4700 * it is. The caller must hold the RTNL lock.
4701 */
4702static bool netdev_has_any_upper_dev(struct net_device *dev)
4703{
4704        ASSERT_RTNL();
4705
4706        return !list_empty(&dev->all_adj_list.upper);
4707}
4708
4709/**
4710 * netdev_master_upper_dev_get - Get master upper device
4711 * @dev: device
4712 *
4713 * Find a master upper device and return pointer to it or NULL in case
4714 * it's not there. The caller must hold the RTNL lock.
4715 */
4716struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4717{
4718        struct netdev_adjacent *upper;
4719
4720        ASSERT_RTNL();
4721
4722        if (list_empty(&dev->adj_list.upper))
4723                return NULL;
4724
4725        upper = list_first_entry(&dev->adj_list.upper,
4726                                 struct netdev_adjacent, list);
4727        if (likely(upper->master))
4728                return upper->dev;
4729        return NULL;
4730}
4731EXPORT_SYMBOL(netdev_master_upper_dev_get);
4732
4733void *netdev_adjacent_get_private(struct list_head *adj_list)
4734{
4735        struct netdev_adjacent *adj;
4736
4737        adj = list_entry(adj_list, struct netdev_adjacent, list);
4738
4739        return adj->private;
4740}
4741EXPORT_SYMBOL(netdev_adjacent_get_private);
4742
4743/**
4744 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4745 * @dev: device
4746 * @iter: list_head ** of the current position
4747 *
4748 * Gets the next device from the dev's upper list, starting from iter
4749 * position. The caller must hold RCU read lock.
4750 */
4751struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4752                                                 struct list_head **iter)
4753{
4754        struct netdev_adjacent *upper;
4755
4756        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4757
4758        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4759
4760        if (&upper->list == &dev->adj_list.upper)
4761                return NULL;
4762
4763        *iter = &upper->list;
4764
4765        return upper->dev;
4766}
4767EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4768
4769/**
4770 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4771 * @dev: device
4772 * @iter: list_head ** of the current position
4773 *
4774 * Gets the next device from the dev's upper list, starting from iter
4775 * position. The caller must hold RCU read lock.
4776 */
4777struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4778                                                     struct list_head **iter)
4779{
4780        struct netdev_adjacent *upper;
4781
4782        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4783
4784        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4785
4786        if (&upper->list == &dev->all_adj_list.upper)
4787                return NULL;
4788
4789        *iter = &upper->list;
4790
4791        return upper->dev;
4792}
4793EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4794
4795/**
4796 * netdev_lower_get_next_private - Get the next ->private from the
4797 *                                 lower neighbour list
4798 * @dev: device
4799 * @iter: list_head ** of the current position
4800 *
4801 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4802 * list, starting from iter position. The caller must hold either hold the
4803 * RTNL lock or its own locking that guarantees that the neighbour lower
4804 * list will remain unchainged.
4805 */
4806void *netdev_lower_get_next_private(struct net_device *dev,
4807                                    struct list_head **iter)
4808{
4809        struct netdev_adjacent *lower;
4810
4811        lower = list_entry(*iter, struct netdev_adjacent, list);
4812
4813        if (&lower->list == &dev->adj_list.lower)
4814                return NULL;
4815
4816        *iter = lower->list.next;
4817
4818        return lower->private;
4819}
4820EXPORT_SYMBOL(netdev_lower_get_next_private);
4821
4822/**
4823 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4824 *                                     lower neighbour list, RCU
4825 *                                     variant
4826 * @dev: device
4827 * @iter: list_head ** of the current position
4828 *
4829 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4830 * list, starting from iter position. The caller must hold RCU read lock.
4831 */
4832void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4833                                        struct list_head **iter)
4834{
4835        struct netdev_adjacent *lower;
4836
4837        WARN_ON_ONCE(!rcu_read_lock_held());
4838
4839        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4840
4841        if (&lower->list == &dev->adj_list.lower)
4842                return NULL;
4843
4844        *iter = &lower->list;
4845
4846        return lower->private;
4847}
4848EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4849
4850/**
4851 * netdev_lower_get_next - Get the next device from the lower neighbour
4852 *                         list
4853 * @dev: device
4854 * @iter: list_head ** of the current position
4855 *
4856 * Gets the next netdev_adjacent from the dev's lower neighbour
4857 * list, starting from iter position. The caller must hold RTNL lock or
4858 * its own locking that guarantees that the neighbour lower
4859 * list will remain unchainged.
4860 */
4861void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4862{
4863        struct netdev_adjacent *lower;
4864
4865        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4866
4867        if (&lower->list == &dev->adj_list.lower)
4868                return NULL;
4869
4870        *iter = &lower->list;
4871
4872        return lower->dev;
4873}
4874EXPORT_SYMBOL(netdev_lower_get_next);
4875
4876/**
4877 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4878 *                                     lower neighbour list, RCU
4879 *                                     variant
4880 * @dev: device
4881 *
4882 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4883 * list. The caller must hold RCU read lock.
4884 */
4885void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4886{
4887        struct netdev_adjacent *lower;
4888
4889        lower = list_first_or_null_rcu(&dev->adj_list.lower,
4890                        struct netdev_adjacent, list);
4891        if (lower)
4892                return lower->private;
4893        return NULL;
4894}
4895EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4896
4897/**
4898 * netdev_master_upper_dev_get_rcu - Get master upper device
4899 * @dev: device
4900 *
4901 * Find a master upper device and return pointer to it or NULL in case
4902 * it's not there. The caller must hold the RCU read lock.
4903 */
4904struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4905{
4906        struct netdev_adjacent *upper;
4907
4908        upper = list_first_or_null_rcu(&dev->adj_list.upper,
4909                                       struct netdev_adjacent, list);
4910        if (upper && likely(upper->master))
4911                return upper->dev;
4912        return NULL;
4913}
4914EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4915
4916static int netdev_adjacent_sysfs_add(struct net_device *dev,
4917                              struct net_device *adj_dev,
4918                              struct list_head *dev_list)
4919{
4920        char linkname[IFNAMSIZ+7];
4921        sprintf(linkname, dev_list == &dev->adj_list.upper ?
4922                "upper_%s" : "lower_%s", adj_dev->name);
4923        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4924                                 linkname);
4925}
4926static void netdev_adjacent_sysfs_del(struct net_device *dev,
4927                               char *name,
4928                               struct list_head *dev_list)
4929{
4930        char linkname[IFNAMSIZ+7];
4931        sprintf(linkname, dev_list == &dev->adj_list.upper ?
4932                "upper_%s" : "lower_%s", name);
4933        sysfs_remove_link(&(dev->dev.kobj), linkname);
4934}
4935
4936static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4937                                                 struct net_device *adj_dev,
4938                                                 struct list_head *dev_list)
4939{
4940        return (dev_list == &dev->adj_list.upper ||
4941                dev_list == &dev->adj_list.lower) &&
4942                net_eq(dev_net(dev), dev_net(adj_dev));
4943}
4944
4945static int __netdev_adjacent_dev_insert(struct net_device *dev,
4946                                        struct net_device *adj_dev,
4947                                        struct list_head *dev_list,
4948                                        void *private, bool master)
4949{
4950        struct netdev_adjacent *adj;
4951        int ret;
4952
4953        adj = __netdev_find_adj(dev, adj_dev, dev_list);
4954
4955        if (adj) {
4956                adj->ref_nr++;
4957                return 0;
4958        }
4959
4960        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4961        if (!adj)
4962                return -ENOMEM;
4963
4964        adj->dev = adj_dev;
4965        adj->master = master;
4966        adj->ref_nr = 1;
4967        adj->private = private;
4968        dev_hold(adj_dev);
4969
4970        pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4971                 adj_dev->name, dev->name, adj_dev->name);
4972
4973        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4974                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4975                if (ret)
4976                        goto free_adj;
4977        }
4978
4979        /* Ensure that master link is always the first item in list. */
4980        if (master) {
4981                ret = sysfs_create_link(&(dev->dev.kobj),
4982                                        &(adj_dev->dev.kobj), "master");
4983                if (ret)
4984                        goto remove_symlinks;
4985
4986                list_add_rcu(&adj->list, dev_list);
4987        } else {
4988                list_add_tail_rcu(&adj->list, dev_list);
4989        }
4990
4991        return 0;
4992
4993remove_symlinks:
4994        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4995                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4996free_adj:
4997        kfree(adj);
4998        dev_put(adj_dev);
4999
5000        return ret;
5001}
5002
5003static void __netdev_adjacent_dev_remove(struct net_device *dev,
5004                                         struct net_device *adj_dev,
5005                                         struct list_head *dev_list)
5006{
5007        struct netdev_adjacent *adj;
5008
5009        adj = __netdev_find_adj(dev, adj_dev, dev_list);
5010
5011        if (!adj) {
5012                pr_err("tried to remove device %s from %s\n",
5013                       dev->name, adj_dev->name);
5014                BUG();
5015        }
5016
5017        if (adj->ref_nr > 1) {
5018                pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5019                         adj->ref_nr-1);
5020                adj->ref_nr--;
5021                return;
5022        }
5023
5024        if (adj->master)
5025                sysfs_remove_link(&(dev->dev.kobj), "master");
5026
5027        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5028                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5029
5030        list_del_rcu(&adj->list);
5031        pr_debug("dev_put for %s, because link removed from %s to %s\n",
5032                 adj_dev->name, dev->name, adj_dev->name);
5033        dev_put(adj_dev);
5034        kfree_rcu(adj, rcu);
5035}
5036
5037static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5038                                            struct net_device *upper_dev,
5039                                            struct list_head *up_list,
5040                                            struct list_head *down_list,
5041                                            void *private, bool master)
5042{
5043        int ret;
5044
5045        ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5046                                           master);
5047        if (ret)
5048                return ret;
5049
5050        ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5051                                           false);
5052        if (ret) {
5053                __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5054                return ret;
5055        }
5056
5057        return 0;
5058}
5059
5060static int __netdev_adjacent_dev_link(struct net_device *dev,
5061                                      struct net_device *upper_dev)
5062{
5063        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5064                                                &dev->all_adj_list.upper,
5065                                                &upper_dev->all_adj_list.lower,
5066                                                NULL, false);
5067}
5068
/*
 * Undo __netdev_adjacent_dev_link_lists(): remove @upper_dev from @up_list
 * and then @dev from @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upper direction first, then the matching lower direction */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5077
5078static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5079                                         struct net_device *upper_dev)
5080{
5081        __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5082                                           &dev->all_adj_list.upper,
5083                                           &upper_dev->all_adj_list.lower);
5084}
5085
5086static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5087                                                struct net_device *upper_dev,
5088                                                void *private, bool master)
5089{
5090        int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5091
5092        if (ret)
5093                return ret;
5094
5095        ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5096                                               &dev->adj_list.upper,
5097                                               &upper_dev->adj_list.lower,
5098                                               private, master);
5099        if (ret) {
5100                __netdev_adjacent_dev_unlink(dev, upper_dev);
5101                return ret;
5102        }
5103
5104        return 0;
5105}
5106
5107static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5108                                                   struct net_device *upper_dev)
5109{
5110        __netdev_adjacent_dev_unlink(dev, upper_dev);
5111        __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5112                                           &dev->adj_list.upper,
5113                                           &upper_dev->adj_list.lower);
5114}
5115
5116static int __netdev_upper_dev_link(struct net_device *dev,
5117                                   struct net_device *upper_dev, bool master,
5118                                   void *private)
5119{
5120        struct netdev_adjacent *i, *j, *to_i, *to_j;
5121        int ret = 0;
5122
5123        ASSERT_RTNL();
5124
5125        if (dev == upper_dev)
5126                return -EBUSY;
5127
5128        /* To prevent loops, check if dev is not upper device to upper_dev. */
5129        if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5130                return -EBUSY;
5131
5132        if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5133                return -EEXIST;
5134
5135        if (master && netdev_master_upper_dev_get(dev))
5136                return -EBUSY;
5137
5138        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5139                                                   master);
5140        if (ret)
5141                return ret;
5142
5143        /* Now that we linked these devs, make all the upper_dev's
5144         * all_adj_list.upper visible to every dev's all_adj_list.lower an
5145         * versa, and don't forget the devices itself. All of these
5146         * links are non-neighbours.
5147         */
5148        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5149                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5150                        pr_debug("Interlinking %s with %s, non-neighbour\n",
5151                                 i->dev->name, j->dev->name);
5152                        ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5153                        if (ret)
5154                                goto rollback_mesh;
5155                }
5156        }
5157
5158        /* add dev to every upper_dev's upper device */
5159        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5160                pr_debug("linking %s's upper device %s with %s\n",
5161                         upper_dev->name, i->dev->name, dev->name);
5162                ret = __netdev_adjacent_dev_link(dev, i->dev);
5163                if (ret)
5164                        goto rollback_upper_mesh;
5165        }
5166
5167        /* add upper_dev to every dev's lower device */
5168        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5169                pr_debug("linking %s's lower device %s with %s\n", dev->name,
5170                         i->dev->name, upper_dev->name);
5171                ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5172                if (ret)
5173                        goto rollback_lower_mesh;
5174        }
5175
5176        call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5177        return 0;
5178
5179rollback_lower_mesh:
5180        to_i = i;
5181        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5182                if (i == to_i)
5183                        break;
5184                __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5185        }
5186
5187        i = NULL;
5188
5189rollback_upper_mesh:
5190        to_i = i;
5191        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5192                if (i == to_i)
5193                        break;
5194                __netdev_adjacent_dev_unlink(dev, i->dev);
5195        }
5196
5197        i = j = NULL;
5198
5199rollback_mesh:
5200        to_i = i;
5201        to_j = j;
5202        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5203                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5204                        if (i == to_i && j == to_j)
5205                                break;
5206                        __netdev_adjacent_dev_unlink(i->dev, j->dev);
5207                }
5208                if (i == to_i)
5209                        break;
5210        }
5211
5212        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5213
5214        return ret;
5215}
5216
5217/**
5218 * netdev_upper_dev_link - Add a link to the upper device
5219 * @dev: device
5220 * @upper_dev: new upper device
5221 *
5222 * Adds a link to device which is upper to this one. The caller must hold
5223 * the RTNL lock. On a failure a negative errno code is returned.
5224 * On success the reference counts are adjusted and the function
5225 * returns zero.
5226 */
5227int netdev_upper_dev_link(struct net_device *dev,
5228                          struct net_device *upper_dev)
5229{
5230        return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5231}
5232EXPORT_SYMBOL(netdev_upper_dev_link);
5233
5234/**
5235 * netdev_master_upper_dev_link - Add a master link to the upper device
5236 * @dev: device
5237 * @upper_dev: new upper device
5238 *
5239 * Adds a link to device which is upper to this one. In this case, only
5240 * one master upper device can be linked, although other non-master devices
5241 * might be linked as well. The caller must hold the RTNL lock.
5242 * On a failure a negative errno code is returned. On success the reference
5243 * counts are adjusted and the function returns zero.
5244 */
5245int netdev_master_upper_dev_link(struct net_device *dev,
5246                                 struct net_device *upper_dev)
5247{
5248        return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5249}
5250EXPORT_SYMBOL(netdev_master_upper_dev_link);
5251
5252int netdev_master_upper_dev_link_private(struct net_device *dev,
5253                                         struct net_device *upper_dev,
5254                                         void *private)
5255{
5256        return __netdev_upper_dev_link(dev, upper_dev, true, private);
5257}
5258EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5259
5260/**
5261 * netdev_upper_dev_unlink - Removes a link to upper device
5262 * @dev: device
5263 * @upper_dev: new upper device
5264 *
5265 * Removes a link to device which is upper to this one. The caller must hold
5266 * the RTNL lock.
5267 */
5268void netdev_upper_dev_unlink(struct net_device *dev,
5269                             struct net_device *upper_dev)
5270{
5271        struct netdev_adjacent *i, *j;
5272        ASSERT_RTNL();
5273
5274        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5275
5276        /* Here is the tricky part. We must remove all dev's lower
5277         * devices from all upper_dev's upper devices and vice
5278         * versa, to maintain the graph relationship.
5279         */
5280        list_for_each_entry(i, &dev->all_adj_list.lower, list)
5281                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5282                        __netdev_adjacent_dev_unlink(i->dev, j->dev);
5283
5284        /* remove also the devices itself from lower/upper device
5285         * list
5286         */
5287        list_for_each_entry(i, &dev->all_adj_list.lower, list)
5288                __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5289
5290        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5291                __netdev_adjacent_dev_unlink(dev, i->dev);
5292
5293        call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5294}
5295EXPORT_SYMBOL(netdev_upper_dev_unlink);
5296
5297static void netdev_adjacent_add_links(struct net_device *dev)
5298{
5299        struct netdev_adjacent *iter;
5300
5301        struct net *net = dev_net(dev);
5302
5303        list_for_each_entry(iter, &dev->adj_list.upper, list) {
5304                if (!net_eq(net,dev_net(iter->dev)))
5305                        continue;
5306                netdev_adjacent_sysfs_add(iter->dev, dev,
5307                                          &iter->dev->adj_list.lower);
5308                netdev_adjacent_sysfs_add(dev, iter->dev,
5309                                          &dev->adj_list.upper);
5310        }
5311
5312        list_for_each_entry(iter, &dev->adj_list.lower, list) {
5313                if (!net_eq(net,dev_net(iter->dev)))
5314                        continue;
5315                netdev_adjacent_sysfs_add(iter->dev, dev,
5316                                          &iter->dev->adj_list.upper);
5317                netdev_adjacent_sysfs_add(dev, iter->dev,
5318                                          &dev->adj_list.lower);
5319        }
5320}
5321
5322static void netdev_adjacent_del_links(struct net_device *dev)
5323{
5324        struct netdev_adjacent *iter;
5325
5326        struct net *net = dev_net(dev);
5327
5328        list_for_each_entry(iter, &dev->adj_list.upper, list) {
5329                if (!net_eq(net,dev_net(iter->dev)))
5330                        continue;
5331                netdev_adjacent_sysfs_del(iter->dev, dev->name,
5332                                          &iter->dev->adj_list.lower);
5333                netdev_adjacent_sysfs_del(dev, iter->dev->name,
5334                                          &dev->adj_list.upper);
5335        }
5336
5337        list_for_each_entry(iter, &dev->adj_list.lower, list) {
5338                if (!net_eq(net,dev_net(iter->dev)))
5339                        continue;
5340                netdev_adjacent_sysfs_del(iter->dev, dev->name,
5341                                          &iter->dev->adj_list.upper);
5342                netdev_adjacent_sysfs_del(dev, iter->dev->name,
5343                                          &dev->adj_list.lower);
5344        }
5345}
5346
5347void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5348{
5349        struct netdev_adjacent *iter;
5350
5351        struct net *net = dev_net(dev);
5352
5353        list_for_each_entry(iter, &dev->adj_list.upper, list) {
5354                if (!net_eq(net,dev_net(iter->dev)))
5355                        continue;
5356                netdev_adjacent_sysfs_del(iter->dev, oldname,
5357                                          &iter->dev->adj_list.lower);
5358                netdev_adjacent_sysfs_add(iter->dev, dev,
5359                                          &iter->dev->adj_list.lower);
5360        }
5361
5362        list_for_each_entry(iter, &dev->adj_list.lower, list) {
5363                if (!net_eq(net,dev_net(iter->dev)))
5364                        continue;
5365                netdev_adjacent_sysfs_del(iter->dev, oldname,
5366                                          &iter->dev->adj_list.upper);
5367                netdev_adjacent_sysfs_add(iter->dev, dev,
5368                                          &iter->dev->adj_list.upper);
5369        }
5370}
5371
5372void *netdev_lower_dev_get_private(struct net_device *dev,
5373                                   struct net_device *lower_dev)
5374{
5375        struct netdev_adjacent *lower;
5376
5377        if (!lower_dev)
5378                return NULL;
5379        lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5380        if (!lower)
5381                return NULL;
5382
5383        return lower->private;
5384}
5385EXPORT_SYMBOL(netdev_lower_dev_get_private);
5386
5387
5388int dev_get_nest_level(struct net_device *dev,
5389                       bool (*type_check)(struct net_device *dev))
5390{
5391        struct net_device *lower = NULL;
5392        struct list_head *iter;
5393        int max_nest = -1;
5394        int nest;
5395
5396        ASSERT_RTNL();
5397
5398        netdev_for_each_lower_dev(dev, lower, iter) {
5399                nest = dev_get_nest_level(lower, type_check);
5400                if (max_nest < nest)
5401                        max_nest = nest;
5402        }
5403
5404        if (type_check(dev))
5405                max_nest++;
5406
5407        return max_nest;
5408}
5409EXPORT_SYMBOL(dev_get_nest_level);
5410
5411static void dev_change_rx_flags(struct net_device *dev, int flags)
5412{
5413        const struct net_device_ops *ops = dev->netdev_ops;
5414
5415        if (ops->ndo_change_rx_flags)
5416                ops->ndo_change_rx_flags(dev, flags);
5417}
5418
5419static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5420{
5421        unsigned int old_flags = dev->flags;
5422        kuid_t uid;
5423        kgid_t gid;
5424
5425        ASSERT_RTNL();
5426
5427        dev->flags |= IFF_PROMISC;
5428        dev->promiscuity += inc;
5429        if (dev->promiscuity == 0) {
5430                /*
5431                 * Avoid overflow.
5432                 * If inc causes overflow, untouch promisc and return error.
5433                 */
5434                if (inc < 0)
5435                        dev->flags &= ~IFF_PROMISC;
5436                else {
5437                        dev->promiscuity -= inc;
5438                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5439                                dev->name);
5440                        return -EOVERFLOW;
5441                }
5442        }
5443        if (dev->flags != old_flags) {
5444                pr_info("device %s %s promiscuous mode\n",
5445                        dev->name,
5446                        dev->flags & IFF_PROMISC ? "entered" : "left");
5447                if (audit_enabled) {
5448                        current_uid_gid(&uid, &gid);
5449                        audit_log(current->audit_context, GFP_ATOMIC,
5450                                AUDIT_ANOM_PROMISCUOUS,
5451                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5452                                dev->name, (dev->flags & IFF_PROMISC),
5453                                (old_flags & IFF_PROMISC),
5454                                from_kuid(&init_user_ns, audit_get_loginuid(current)),
5455                                from_kuid(&init_user_ns, uid),
5456                                from_kgid(&init_user_ns, gid),
5457                                audit_get_sessionid(current));
5458                }
5459
5460                dev_change_rx_flags(dev, IFF_PROMISC);
5461        }
5462        if (notify)
5463                __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5464        return 0;
5465}
5466
5467/**
5468 *      dev_set_promiscuity     - update promiscuity count on a device
5469 *      @dev: device
5470 *      @inc: modifier
5471 *
5472 *      Add or remove promiscuity from a device. While the count in the device
5473 *      remains above zero the interface remains promiscuous. Once it hits zero
5474 *      the device reverts back to normal filtering operation. A negative inc
5475 *      value is used to drop promiscuity on the device.
5476 *      Return 0 if successful or a negative errno code on error.
5477 */
5478int dev_set_promiscuity(struct net_device *dev, int inc)
5479{
5480        unsigned int old_flags = dev->flags;
5481        int err;
5482
5483        err = __dev_set_promiscuity(dev, inc, true);
5484        if (err < 0)
5485                return err;
5486        if (dev->flags != old_flags)
5487                dev_set_rx_mode(dev);
5488        return err;
5489}
5490EXPORT_SYMBOL(dev_set_promiscuity);
5491
5492static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5493{
5494        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5495
5496        ASSERT_RTNL();
5497
5498        dev->flags |= IFF_ALLMULTI;
5499        dev->allmulti += inc;
5500        if (dev->allmulti == 0) {
5501                /*
5502                 * Avoid overflow.
5503                 * If inc causes overflow, untouch allmulti and return error.
5504                 */
5505                if (inc < 0)
5506                        dev->flags &= ~IFF_ALLMULTI;
5507                else {
5508                        dev->allmulti -= inc;
5509                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5510                                dev->name);
5511                        return -EOVERFLOW;
5512                }
5513        }
5514        if (dev->flags ^ old_flags) {
5515                dev_change_rx_flags(dev, IFF_ALLMULTI);
5516                dev_set_rx_mode(dev);
5517                if (notify)
5518                        __dev_notify_flags(dev, old_flags,
5519                                           dev->gflags ^ old_gflags);
5520        }
5521        return 0;
5522}
5523
5524/**
5525 *      dev_set_allmulti        - update allmulti count on a device
5526 *      @dev: device
5527 *      @inc: modifier
5528 *
5529 *      Add or remove reception of all multicast frames to a device. While the
5530 *      count in the device remains above zero the interface remains listening
5531 *      to all interfaces. Once it hits zero the device reverts back to normal
5532 *      filtering operation. A negative @inc value is used to drop the counter
5533 *      when releasing a resource needing all multicasts.
5534 *      Return 0 if successful or a negative errno code on error.
5535 */
5536
5537int dev_set_allmulti(struct net_device *dev, int inc)
5538{
5539        return __dev_set_allmulti(dev, inc, true);
5540}
5541EXPORT_SYMBOL(dev_set_allmulti);
5542
5543/*
5544 *      Upload unicast and multicast address lists to device and
5545 *      configure RX filtering. When the device doesn't support unicast
5546 *      filtering it is put in promiscuous mode while unicast addresses
5547 *      are present.
5548 */
5549void __dev_set_rx_mode(struct net_device *dev)
5550{
5551        const struct net_device_ops *ops = dev->netdev_ops;
5552
5553        /* dev_open will call this function so the list will stay sane. */
5554        if (!(dev->flags&IFF_UP))
5555                return;
5556
5557        if (!netif_device_present(dev))
5558                return;
5559
5560        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5561                /* Unicast addresses changes may only happen under the rtnl,
5562                 * therefore calling __dev_set_promiscuity here is safe.
5563                 */
5564                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5565                        __dev_set_promiscuity(dev, 1, false);
5566                        dev->uc_promisc = true;
5567                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5568                        __dev_set_promiscuity(dev, -1, false);
5569                        dev->uc_promisc = false;
5570                }
5571        }
5572
5573        if (ops->ndo_set_rx_mode)
5574                ops->ndo_set_rx_mode(dev);
5575}
5576
/*
 * Run __dev_set_rx_mode() on @dev between netif_addr_lock_bh() and
 * netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
5583
5584/**
5585 *      dev_get_flags - get flags reported to userspace
5586 *      @dev: device
5587 *
5588 *      Get the combination of flag bits exported through APIs to userspace.
5589 */
5590unsigned int dev_get_flags(const struct net_device *dev)
5591{
5592        unsigned int flags;
5593
5594        flags = (dev->flags & ~(IFF_PROMISC |
5595                                IFF_ALLMULTI |
5596                                IFF_RUNNING |
5597                                IFF_LOWER_UP |
5598                                IFF_DORMANT)) |
5599                (dev->gflags & (IFF_PROMISC |
5600                                IFF_ALLMULTI));
5601
5602        if (netif_running(dev)) {
5603                if (netif_oper_up(dev))
5604                        flags |= IFF_RUNNING;
5605                if (netif_carrier_ok(dev))
5606                        flags |= IFF_LOWER_UP;
5607                if (netif_dormant(dev))
5608                        flags |= IFF_DORMANT;
5609        }
5610
5611        return flags;
5612}
5613EXPORT_SYMBOL(dev_get_flags);
5614
5615int __dev_change_flags(struct net_device *dev, unsigned int flags)
5616{
5617        unsigned int old_flags = dev->flags;
5618        int ret;
5619
5620        ASSERT_RTNL();
5621
5622        /*
5623         *      Set the flags on our device.
5624         */
5625
5626        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5627                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5628                               IFF_AUTOMEDIA)) |
5629                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5630                                    IFF_ALLMULTI));
5631
5632        /*
5633         *      Load in the correct multicast list now the flags have changed.
5634         */
5635
5636        if ((old_flags ^ flags) & IFF_MULTICAST)
5637                dev_change_rx_flags(dev, IFF_MULTICAST);
5638
5639        dev_set_rx_mode(dev);
5640
5641        /*
5642         *      Have we downed the interface. We handle IFF_UP ourselves
5643         *      according to user attempts to set it, rather than blindly
5644         *      setting it.
5645         */
5646
5647        ret = 0;
5648        if ((old_flags ^ flags) & IFF_UP)
5649                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5650
5651        if ((flags ^ dev->gflags) & IFF_PROMISC) {
5652                int inc = (flags & IFF_PROMISC) ? 1 : -1;
5653                unsigned int old_flags = dev->flags;
5654
5655                dev->gflags ^= IFF_PROMISC;
5656
5657                if (__dev_set_promiscuity(dev, inc, false) >= 0)
5658                        if (dev->flags != old_flags)
5659                                dev_set_rx_mode(dev);
5660        }
5661
5662        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5663           is important. Some (broken) drivers set IFF_PROMISC, when
5664           IFF_ALLMULTI is requested not asking us and not reporting.
5665         */
5666        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5667                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5668
5669                dev->gflags ^= IFF_ALLMULTI;
5670                __dev_set_allmulti(dev, inc, false);
5671        }
5672
5673        return ret;
5674}
5675
5676void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5677                        unsigned int gchanges)
5678{
5679        unsigned int changes = dev->flags ^ old_flags;
5680
5681        if (gchanges)
5682                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5683
5684        if (changes & IFF_UP) {
5685                if (dev->flags & IFF_UP)
5686                        call_netdevice_notifiers(NETDEV_UP, dev);
5687                else
5688                        call_netdevice_notifiers(NETDEV_DOWN, dev);
5689        }
5690
5691        if (dev->flags & IFF_UP &&
5692            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5693                struct netdev_notifier_change_info change_info;
5694
5695                change_info.flags_changed = changes;
5696                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5697                                              &change_info.info);
5698        }
5699}
5700
5701/**
5702 *      dev_change_flags - change device settings
5703 *      @dev: device
5704 *      @flags: device state flags
5705 *
5706 *      Change settings on device based state flags. The flags are
5707 *      in the userspace exported format.
5708 */
5709int dev_change_flags(struct net_device *dev, unsigned int flags)
5710{
5711        int ret;
5712        unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5713
5714        ret = __dev_change_flags(dev, flags);
5715        if (ret < 0)
5716                return ret;
5717
5718        changes = (old_flags ^ dev->flags</