linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <linux/rtnetlink.h>
 100#include <linux/proc_fs.h>
 101#include <linux/seq_file.h>
 102#include <linux/stat.h>
 103#include <net/dst.h>
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/kmod.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122#include <linux/if_vlan.h>
 123#include <linux/ip.h>
 124#include <net/ip.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/net_tstamp.h>
 136#include <linux/static_key.h>
 137#include <net/flow_keys.h>
 138
 139#include "net-sysfs.h"
 140
 141/* Instead of increasing this, you should create a hash table. */
 142#define MAX_GRO_SKBS 8
 143
 144/* This should be increased if a protocol with a bigger head is added. */
 145#define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147/*
 148 *      The list of packet types we will receive (as opposed to discard)
 149 *      and the routines to invoke.
 150 *
 151 *      Why 16. Because with 16 the only overlap we get on a hash of the
 152 *      low nibble of the protocol value is RARP/SNAP/X.25.
 153 *
 154 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155 *             sure which should go first, but I bet it won't make much
 156 *             difference if we are running VLANs.  The good news is that
 157 *             this protocol won't be in the list unless compiled in, so
 158 *             the average user (w/out VLANs) will not be adversely affected.
 159 *             --BLG
 160 *
 161 *              0800    IP
 162 *              8100    802.1Q VLAN
 163 *              0001    802.3
 164 *              0002    AX.25
 165 *              0004    802.2
 166 *              8035    RARP
 167 *              0005    SNAP
 168 *              0805    X.25
 169 *              0806    ARP
 170 *              8137    IPX
 171 *              0009    Localtalk
 172 *              86DD    IPv6
 173 */
 174
 175#define PTYPE_HASH_SIZE (16)
 176#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 177
 178static DEFINE_SPINLOCK(ptype_lock);
 179static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 180static struct list_head ptype_all __read_mostly;        /* Taps */
 181
 182/*
 183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 184 * semaphore.
 185 *
 186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 187 *
 188 * Writers must hold the rtnl semaphore while they loop through the
 189 * dev_base_head list, and hold dev_base_lock for writing when they do the
 190 * actual updates.  This allows pure readers to access the list even
 191 * while a writer is preparing to update it.
 192 *
 193 * To put it another way, dev_base_lock is held for writing only to
 194 * protect against pure readers; the rtnl semaphore provides the
 195 * protection against other writers.
 196 *
 197 * See, for example usages, register_netdevice() and
 198 * unregister_netdevice(), which must be called with the rtnl
 199 * semaphore held.
 200 */
 201DEFINE_RWLOCK(dev_base_lock);
 202EXPORT_SYMBOL(dev_base_lock);
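
/*
 * Illustrative sketch (not part of dev.c): the two pure-reader patterns the
 * locking rules above allow.  "my_dev" and do_something() are hypothetical.
 *
 *	// RCU reader: may run concurrently with a writer
 *	rcu_read_lock();
 *	for_each_netdev_rcu(&init_net, my_dev)
 *		do_something(my_dev);
 *	rcu_read_unlock();
 *
 *	// rwlock reader: excluded only while a writer holds dev_base_lock
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(&init_net, my_dev)
 *		do_something(my_dev);
 *	read_unlock(&dev_base_lock);
 *
 * Writers additionally take the rtnl semaphore (rtnl_lock()/rtnl_unlock())
 * around the whole update, as register_netdevice() does.
 */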
 203
 204static inline void dev_base_seq_inc(struct net *net)
 205{
 206        while (++net->dev_base_seq == 0);
 207}
 208
 209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 210{
 211        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 212
 213        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 214}
 215
 216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 217{
 218        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 219}
 220
 221static inline void rps_lock(struct softnet_data *sd)
 222{
 223#ifdef CONFIG_RPS
 224        spin_lock(&sd->input_pkt_queue.lock);
 225#endif
 226}
 227
 228static inline void rps_unlock(struct softnet_data *sd)
 229{
 230#ifdef CONFIG_RPS
 231        spin_unlock(&sd->input_pkt_queue.lock);
 232#endif
 233}
 234
 235/* Device list insertion */
 236static int list_netdevice(struct net_device *dev)
 237{
 238        struct net *net = dev_net(dev);
 239
 240        ASSERT_RTNL();
 241
 242        write_lock_bh(&dev_base_lock);
 243        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 244        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 245        hlist_add_head_rcu(&dev->index_hlist,
 246                           dev_index_hash(net, dev->ifindex));
 247        write_unlock_bh(&dev_base_lock);
 248
 249        dev_base_seq_inc(net);
 250
 251        return 0;
 252}
 253
 254/* Device list removal
 255 * caller must respect a RCU grace period before freeing/reusing dev
 256 */
 257static void unlist_netdevice(struct net_device *dev)
 258{
 259        ASSERT_RTNL();
 260
 261        /* Unlink dev from the device chain */
 262        write_lock_bh(&dev_base_lock);
 263        list_del_rcu(&dev->dev_list);
 264        hlist_del_rcu(&dev->name_hlist);
 265        hlist_del_rcu(&dev->index_hlist);
 266        write_unlock_bh(&dev_base_lock);
 267
 268        dev_base_seq_inc(dev_net(dev));
 269}
 270
 271/*
 272 *      Our notifier list
 273 */
 274
 275static RAW_NOTIFIER_HEAD(netdev_chain);
 276
 277/*
 278 *      Device drivers call our routines to queue packets here. We empty the
 279 *      queue in the local softnet handler.
 280 */
 281
 282DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 283EXPORT_PER_CPU_SYMBOL(softnet_data);
 284
 285#ifdef CONFIG_LOCKDEP
 286/*
 287 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 288 * according to dev->type
 289 */
 290static const unsigned short netdev_lock_type[] =
 291        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 292         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 293         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 294         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 295         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 296         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 297         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 298         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 299         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 300         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 301         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 302         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 303         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 304         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 305         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 306
 307static const char *const netdev_lock_name[] =
 308        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 309         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 310         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 311         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 312         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 313         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 314         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 315         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 316         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 317         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 318         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 319         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 320         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 321         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 322         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 323
 324static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 325static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 326
 327static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 328{
 329        int i;
 330
 331        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 332                if (netdev_lock_type[i] == dev_type)
 333                        return i;
 334        /* the last key is used by default */
 335        return ARRAY_SIZE(netdev_lock_type) - 1;
 336}
 337
 338static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 339                                                 unsigned short dev_type)
 340{
 341        int i;
 342
 343        i = netdev_lock_pos(dev_type);
 344        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 345                                   netdev_lock_name[i]);
 346}
 347
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350        int i;
 351
 352        i = netdev_lock_pos(dev->type);
 353        lockdep_set_class_and_name(&dev->addr_list_lock,
 354                                   &netdev_addr_lock_key[i],
 355                                   netdev_lock_name[i]);
 356}
 357#else
 358static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 359                                                 unsigned short dev_type)
 360{
 361}
 362static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 363{
 364}
 365#endif
 366
 367/*******************************************************************************
 368
 369                Protocol management and registration routines
 370
 371*******************************************************************************/
 372
 373/*
 374 *      Add a protocol ID to the list. Now that the input handler is
 375 *      smarter we can dispense with all the messy stuff that used to be
 376 *      here.
 377 *
 378 *      BEWARE!!! Protocol handlers, mangling input packets,
 379 *      MUST BE last in hash buckets and checking protocol handlers
 380 *      MUST start from promiscuous ptype_all chain in net_bh.
 381 *      It is true now, do not change it.
 382 *      Explanation follows: if protocol handler, mangling packet, will
 383 *      be the first on list, it is not able to sense, that packet
 384 *      is cloned and should be copied-on-write, so that it will
 385 *      change it and subsequent readers will get broken packet.
 386 *                                                      --ANK (980803)
 387 */
 388
 389static inline struct list_head *ptype_head(const struct packet_type *pt)
 390{
 391        if (pt->type == htons(ETH_P_ALL))
 392                return &ptype_all;
 393        else
 394                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 395}
 396
 397/**
 398 *      dev_add_pack - add packet handler
 399 *      @pt: packet type declaration
 400 *
 401 *      Add a protocol handler to the networking stack. The passed &packet_type
 402 *      is linked into kernel lists and may not be freed until it has been
 403 *      removed from the kernel lists.
 404 *
  405 *      This call does not sleep, therefore it cannot
  406 *      guarantee that all CPUs in the middle of receiving packets
  407 *      will see the new packet type (until the next received packet).
 408 */
 409
 410void dev_add_pack(struct packet_type *pt)
 411{
 412        struct list_head *head = ptype_head(pt);
 413
 414        spin_lock(&ptype_lock);
 415        list_add_rcu(&pt->list, head);
 416        spin_unlock(&ptype_lock);
 417}
 418EXPORT_SYMBOL(dev_add_pack);
 419
 420/**
 421 *      __dev_remove_pack        - remove packet handler
 422 *      @pt: packet type declaration
 423 *
 424 *      Remove a protocol handler that was previously added to the kernel
 425 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 426 *      from the kernel lists and can be freed or reused once this function
 427 *      returns.
 428 *
 429 *      The packet type might still be in use by receivers
  430 *      and must not be freed until after all the CPUs have gone
 431 *      through a quiescent state.
 432 */
 433void __dev_remove_pack(struct packet_type *pt)
 434{
 435        struct list_head *head = ptype_head(pt);
 436        struct packet_type *pt1;
 437
 438        spin_lock(&ptype_lock);
 439
 440        list_for_each_entry(pt1, head, list) {
 441                if (pt == pt1) {
 442                        list_del_rcu(&pt->list);
 443                        goto out;
 444                }
 445        }
 446
 447        pr_warn("dev_remove_pack: %p not found\n", pt);
 448out:
 449        spin_unlock(&ptype_lock);
 450}
 451EXPORT_SYMBOL(__dev_remove_pack);
 452
 453/**
 454 *      dev_remove_pack  - remove packet handler
 455 *      @pt: packet type declaration
 456 *
 457 *      Remove a protocol handler that was previously added to the kernel
 458 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 459 *      from the kernel lists and can be freed or reused once this function
 460 *      returns.
 461 *
 462 *      This call sleeps to guarantee that no CPU is looking at the packet
 463 *      type after return.
 464 */
 465void dev_remove_pack(struct packet_type *pt)
 466{
 467        __dev_remove_pack(pt);
 468
 469        synchronize_net();
 470}
 471EXPORT_SYMBOL(dev_remove_pack);
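
/*
 * Illustrative sketch (not part of dev.c): how a protocol module might pair
 * dev_add_pack() with dev_remove_pack().  ETH_P_EXAMPLE, example_rcv and the
 * module hooks are hypothetical names.
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		// consume the packet
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_packet_type __read_mostly = {
 *		.type = htons(ETH_P_EXAMPLE),	// hypothetical ethertype
 *		.func = example_rcv,
 *	};
 *
 *	// module init:  dev_add_pack(&example_packet_type);
 *	// module exit:  dev_remove_pack(&example_packet_type);
 */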
 472
 473/******************************************************************************
 474
 475                      Device Boot-time Settings Routines
 476
 477*******************************************************************************/
 478
 479/* Boot time configuration table */
 480static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 481
 482/**
 483 *      netdev_boot_setup_add   - add new setup entry
 484 *      @name: name of the device
 485 *      @map: configured settings for the device
 486 *
  487 *      Adds a new setup entry to the dev_boot_setup list.  The function
  488 *      returns 0 on error and 1 on success.  This is a generic routine for
  489 *      all netdevices.
 490 */
 491static int netdev_boot_setup_add(char *name, struct ifmap *map)
 492{
 493        struct netdev_boot_setup *s;
 494        int i;
 495
 496        s = dev_boot_setup;
 497        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 498                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 499                        memset(s[i].name, 0, sizeof(s[i].name));
 500                        strlcpy(s[i].name, name, IFNAMSIZ);
 501                        memcpy(&s[i].map, map, sizeof(s[i].map));
 502                        break;
 503                }
 504        }
 505
 506        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 507}
 508
 509/**
 510 *      netdev_boot_setup_check - check boot time settings
 511 *      @dev: the netdevice
 512 *
 513 *      Check boot time settings for the device.
 514 *      The found settings are set for the device to be used
 515 *      later in the device probing.
  516 *      Returns 0 if no settings are found, 1 if they are.
 517 */
 518int netdev_boot_setup_check(struct net_device *dev)
 519{
 520        struct netdev_boot_setup *s = dev_boot_setup;
 521        int i;
 522
 523        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 524                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 525                    !strcmp(dev->name, s[i].name)) {
 526                        dev->irq        = s[i].map.irq;
 527                        dev->base_addr  = s[i].map.base_addr;
 528                        dev->mem_start  = s[i].map.mem_start;
 529                        dev->mem_end    = s[i].map.mem_end;
 530                        return 1;
 531                }
 532        }
 533        return 0;
 534}
 535EXPORT_SYMBOL(netdev_boot_setup_check);
 536
 537
 538/**
 539 *      netdev_boot_base        - get address from boot time settings
 540 *      @prefix: prefix for network device
 541 *      @unit: id for network device
 542 *
 543 *      Check boot time settings for the base address of device.
 544 *      The found settings are set for the device to be used
 545 *      later in the device probing.
 546 *      Returns 0 if no settings found.
 547 */
 548unsigned long netdev_boot_base(const char *prefix, int unit)
 549{
 550        const struct netdev_boot_setup *s = dev_boot_setup;
 551        char name[IFNAMSIZ];
 552        int i;
 553
 554        sprintf(name, "%s%d", prefix, unit);
 555
 556        /*
 557         * If device already registered then return base of 1
 558         * to indicate not to probe for this interface
 559         */
 560        if (__dev_get_by_name(&init_net, name))
 561                return 1;
 562
 563        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 564                if (!strcmp(name, s[i].name))
 565                        return s[i].map.base_addr;
 566        return 0;
 567}
 568
 569/*
 570 * Saves at boot time configured settings for any netdevice.
 571 */
 572int __init netdev_boot_setup(char *str)
 573{
 574        int ints[5];
 575        struct ifmap map;
 576
 577        str = get_options(str, ARRAY_SIZE(ints), ints);
 578        if (!str || !*str)
 579                return 0;
 580
 581        /* Save settings */
 582        memset(&map, 0, sizeof(map));
 583        if (ints[0] > 0)
 584                map.irq = ints[1];
 585        if (ints[0] > 1)
 586                map.base_addr = ints[2];
 587        if (ints[0] > 2)
 588                map.mem_start = ints[3];
 589        if (ints[0] > 3)
 590                map.mem_end = ints[4];
 591
 592        /* Add new entry to the list */
 593        return netdev_boot_setup_add(str, &map);
 594}
 595
 596__setup("netdev=", netdev_boot_setup);
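
/*
 * Illustrative note (not part of dev.c): given the parsing above, a command
 * line entry supplies up to four integers (irq, base I/O address, memory
 * start, memory end) followed by the interface name, e.g.
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * Trailing integers may be omitted; whatever get_options() leaves behind is
 * treated as the name and stored via netdev_boot_setup_add().
 */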
 597
 598/*******************************************************************************
 599
 600                            Device Interface Subroutines
 601
 602*******************************************************************************/
 603
 604/**
 605 *      __dev_get_by_name       - find a device by its name
 606 *      @net: the applicable net namespace
 607 *      @name: name to find
 608 *
 609 *      Find an interface by name. Must be called under RTNL semaphore
 610 *      or @dev_base_lock. If the name is found a pointer to the device
 611 *      is returned. If the name is not found then %NULL is returned. The
 612 *      reference counters are not incremented so the caller must be
 613 *      careful with locks.
 614 */
 615
 616struct net_device *__dev_get_by_name(struct net *net, const char *name)
 617{
 618        struct hlist_node *p;
 619        struct net_device *dev;
 620        struct hlist_head *head = dev_name_hash(net, name);
 621
 622        hlist_for_each_entry(dev, p, head, name_hlist)
 623                if (!strncmp(dev->name, name, IFNAMSIZ))
 624                        return dev;
 625
 626        return NULL;
 627}
 628EXPORT_SYMBOL(__dev_get_by_name);
 629
 630/**
 631 *      dev_get_by_name_rcu     - find a device by its name
 632 *      @net: the applicable net namespace
 633 *      @name: name to find
 634 *
 635 *      Find an interface by name.
 636 *      If the name is found a pointer to the device is returned.
 637 *      If the name is not found then %NULL is returned.
 638 *      The reference counters are not incremented so the caller must be
 639 *      careful with locks. The caller must hold RCU lock.
 640 */
 641
 642struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 643{
 644        struct hlist_node *p;
 645        struct net_device *dev;
 646        struct hlist_head *head = dev_name_hash(net, name);
 647
 648        hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 649                if (!strncmp(dev->name, name, IFNAMSIZ))
 650                        return dev;
 651
 652        return NULL;
 653}
 654EXPORT_SYMBOL(dev_get_by_name_rcu);
 655
 656/**
 657 *      dev_get_by_name         - find a device by its name
 658 *      @net: the applicable net namespace
 659 *      @name: name to find
 660 *
 661 *      Find an interface by name. This can be called from any
 662 *      context and does its own locking. The returned handle has
 663 *      the usage count incremented and the caller must use dev_put() to
 664 *      release it when it is no longer needed. %NULL is returned if no
 665 *      matching device is found.
 666 */
 667
 668struct net_device *dev_get_by_name(struct net *net, const char *name)
 669{
 670        struct net_device *dev;
 671
 672        rcu_read_lock();
 673        dev = dev_get_by_name_rcu(net, name);
 674        if (dev)
 675                dev_hold(dev);
 676        rcu_read_unlock();
 677        return dev;
 678}
 679EXPORT_SYMBOL(dev_get_by_name);
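
/*
 * Illustrative sketch (not part of dev.c): the refcounted lookup pattern
 * dev_get_by_name() is meant for.  "eth0" and do_something() are
 * hypothetical; the essential part is dev_put() on every exit path.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (!dev)
 *		return -ENODEV;
 *	do_something(dev);
 *	dev_put(dev);		// drop the reference taken by the lookup
 */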
 680
 681/**
 682 *      __dev_get_by_index - find a device by its ifindex
 683 *      @net: the applicable net namespace
 684 *      @ifindex: index of device
 685 *
 686 *      Search for an interface by index. Returns %NULL if the device
 687 *      is not found or a pointer to the device. The device has not
 688 *      had its reference counter increased so the caller must be careful
 689 *      about locking. The caller must hold either the RTNL semaphore
 690 *      or @dev_base_lock.
 691 */
 692
 693struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 694{
 695        struct hlist_node *p;
 696        struct net_device *dev;
 697        struct hlist_head *head = dev_index_hash(net, ifindex);
 698
 699        hlist_for_each_entry(dev, p, head, index_hlist)
 700                if (dev->ifindex == ifindex)
 701                        return dev;
 702
 703        return NULL;
 704}
 705EXPORT_SYMBOL(__dev_get_by_index);
 706
 707/**
 708 *      dev_get_by_index_rcu - find a device by its ifindex
 709 *      @net: the applicable net namespace
 710 *      @ifindex: index of device
 711 *
 712 *      Search for an interface by index. Returns %NULL if the device
 713 *      is not found or a pointer to the device. The device has not
 714 *      had its reference counter increased so the caller must be careful
 715 *      about locking. The caller must hold RCU lock.
 716 */
 717
 718struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 719{
 720        struct hlist_node *p;
 721        struct net_device *dev;
 722        struct hlist_head *head = dev_index_hash(net, ifindex);
 723
 724        hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 725                if (dev->ifindex == ifindex)
 726                        return dev;
 727
 728        return NULL;
 729}
 730EXPORT_SYMBOL(dev_get_by_index_rcu);
 731
 732
 733/**
 734 *      dev_get_by_index - find a device by its ifindex
 735 *      @net: the applicable net namespace
 736 *      @ifindex: index of device
 737 *
 738 *      Search for an interface by index. Returns NULL if the device
 739 *      is not found or a pointer to the device. The device returned has
 740 *      had a reference added and the pointer is safe until the user calls
 741 *      dev_put to indicate they have finished with it.
 742 */
 743
 744struct net_device *dev_get_by_index(struct net *net, int ifindex)
 745{
 746        struct net_device *dev;
 747
 748        rcu_read_lock();
 749        dev = dev_get_by_index_rcu(net, ifindex);
 750        if (dev)
 751                dev_hold(dev);
 752        rcu_read_unlock();
 753        return dev;
 754}
 755EXPORT_SYMBOL(dev_get_by_index);
 756
 757/**
 758 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 759 *      @net: the applicable net namespace
 760 *      @type: media type of device
 761 *      @ha: hardware address
 762 *
 763 *      Search for an interface by MAC address. Returns NULL if the device
 764 *      is not found or a pointer to the device.
 765 *      The caller must hold RCU or RTNL.
 766 *      The returned device has not had its ref count increased
 767 *      and the caller must therefore be careful about locking
 768 *
 769 */
 770
 771struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 772                                       const char *ha)
 773{
 774        struct net_device *dev;
 775
 776        for_each_netdev_rcu(net, dev)
 777                if (dev->type == type &&
 778                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 779                        return dev;
 780
 781        return NULL;
 782}
 783EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 784
 785struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786{
 787        struct net_device *dev;
 788
 789        ASSERT_RTNL();
 790        for_each_netdev(net, dev)
 791                if (dev->type == type)
 792                        return dev;
 793
 794        return NULL;
 795}
 796EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 797
 798struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 799{
 800        struct net_device *dev, *ret = NULL;
 801
 802        rcu_read_lock();
 803        for_each_netdev_rcu(net, dev)
 804                if (dev->type == type) {
 805                        dev_hold(dev);
 806                        ret = dev;
 807                        break;
 808                }
 809        rcu_read_unlock();
 810        return ret;
 811}
 812EXPORT_SYMBOL(dev_getfirstbyhwtype);
 813
 814/**
 815 *      dev_get_by_flags_rcu - find any device with given flags
 816 *      @net: the applicable net namespace
 817 *      @if_flags: IFF_* values
 818 *      @mask: bitmask of bits in if_flags to check
 819 *
 820 *      Search for any interface with the given flags. Returns NULL if a device
 821 *      is not found or a pointer to the device. Must be called inside
 822 *      rcu_read_lock(), and result refcount is unchanged.
 823 */
 824
 825struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 826                                    unsigned short mask)
 827{
 828        struct net_device *dev, *ret;
 829
 830        ret = NULL;
 831        for_each_netdev_rcu(net, dev) {
 832                if (((dev->flags ^ if_flags) & mask) == 0) {
 833                        ret = dev;
 834                        break;
 835                }
 836        }
 837        return ret;
 838}
 839EXPORT_SYMBOL(dev_get_by_flags_rcu);
 840
 841/**
 842 *      dev_valid_name - check if name is okay for network device
 843 *      @name: name string
 844 *
  845 *      Network device names need to be valid file names
  846 *      to allow sysfs to work.  We also disallow any kind of
 847 *      whitespace.
 848 */
 849bool dev_valid_name(const char *name)
 850{
 851        if (*name == '\0')
 852                return false;
 853        if (strlen(name) >= IFNAMSIZ)
 854                return false;
 855        if (!strcmp(name, ".") || !strcmp(name, ".."))
 856                return false;
 857
 858        while (*name) {
 859                if (*name == '/' || isspace(*name))
 860                        return false;
 861                name++;
 862        }
 863        return true;
 864}
 865EXPORT_SYMBOL(dev_valid_name);
 866
 867/**
 868 *      __dev_alloc_name - allocate a name for a device
 869 *      @net: network namespace to allocate the device name in
 870 *      @name: name format string
 871 *      @buf:  scratch buffer and result name string
 872 *
 873 *      Passed a format string - eg "lt%d" it will try and find a suitable
 874 *      id. It scans list of devices to build up a free map, then chooses
 875 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 876 *      while allocating the name and adding the device in order to avoid
 877 *      duplicates.
 878 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 879 *      Returns the number of the unit assigned or a negative errno code.
 880 */
 881
 882static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 883{
 884        int i = 0;
 885        const char *p;
 886        const int max_netdevices = 8*PAGE_SIZE;
 887        unsigned long *inuse;
 888        struct net_device *d;
 889
 890        p = strnchr(name, IFNAMSIZ-1, '%');
 891        if (p) {
 892                /*
 893                 * Verify the string as this thing may have come from
 894                 * the user.  There must be either one "%d" and no other "%"
 895                 * characters.
 896                 */
 897                if (p[1] != 'd' || strchr(p + 2, '%'))
 898                        return -EINVAL;
 899
 900                /* Use one page as a bit array of possible slots */
 901                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 902                if (!inuse)
 903                        return -ENOMEM;
 904
 905                for_each_netdev(net, d) {
 906                        if (!sscanf(d->name, name, &i))
 907                                continue;
 908                        if (i < 0 || i >= max_netdevices)
 909                                continue;
 910
 911                        /*  avoid cases where sscanf is not exact inverse of printf */
 912                        snprintf(buf, IFNAMSIZ, name, i);
 913                        if (!strncmp(buf, d->name, IFNAMSIZ))
 914                                set_bit(i, inuse);
 915                }
 916
 917                i = find_first_zero_bit(inuse, max_netdevices);
 918                free_page((unsigned long) inuse);
 919        }
 920
 921        if (buf != name)
 922                snprintf(buf, IFNAMSIZ, name, i);
 923        if (!__dev_get_by_name(net, buf))
 924                return i;
 925
 926        /* It is possible to run out of possible slots
 927         * when the name is long and there isn't enough space left
 928         * for the digits, or if all bits are used.
 929         */
 930        return -ENFILE;
 931}
 932
 933/**
 934 *      dev_alloc_name - allocate a name for a device
 935 *      @dev: device
 936 *      @name: name format string
 937 *
 938 *      Passed a format string - eg "lt%d" it will try and find a suitable
 939 *      id. It scans list of devices to build up a free map, then chooses
 940 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 941 *      while allocating the name and adding the device in order to avoid
 942 *      duplicates.
 943 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 944 *      Returns the number of the unit assigned or a negative errno code.
 945 */
 946
 947int dev_alloc_name(struct net_device *dev, const char *name)
 948{
 949        char buf[IFNAMSIZ];
 950        struct net *net;
 951        int ret;
 952
 953        BUG_ON(!dev_net(dev));
 954        net = dev_net(dev);
 955        ret = __dev_alloc_name(net, name, buf);
 956        if (ret >= 0)
 957                strlcpy(dev->name, buf, IFNAMSIZ);
 958        return ret;
 959}
 960EXPORT_SYMBOL(dev_alloc_name);
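
/*
 * Illustrative sketch (not part of dev.c): how a driver typically uses the
 * "%d" wildcard described above; "foo%d" is a hypothetical template.
 *
 *	err = dev_alloc_name(dev, "foo%d");
 *	if (err < 0)
 *		goto out_free;
 *	// dev->name is now e.g. "foo0" and err holds the unit number
 */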
 961
 962static int dev_alloc_name_ns(struct net *net,
 963                             struct net_device *dev,
 964                             const char *name)
 965{
 966        char buf[IFNAMSIZ];
 967        int ret;
 968
 969        ret = __dev_alloc_name(net, name, buf);
 970        if (ret >= 0)
 971                strlcpy(dev->name, buf, IFNAMSIZ);
 972        return ret;
 973}
 974
 975static int dev_get_valid_name(struct net *net,
 976                              struct net_device *dev,
 977                              const char *name)
 978{
 979        BUG_ON(!net);
 980
 981        if (!dev_valid_name(name))
 982                return -EINVAL;
 983
 984        if (strchr(name, '%'))
 985                return dev_alloc_name_ns(net, dev, name);
 986        else if (__dev_get_by_name(net, name))
 987                return -EEXIST;
 988        else if (dev->name != name)
 989                strlcpy(dev->name, name, IFNAMSIZ);
 990
 991        return 0;
 992}
 993
 994/**
 995 *      dev_change_name - change name of a device
 996 *      @dev: device
 997 *      @newname: name (or format string) must be at least IFNAMSIZ
 998 *
  999 *      Change name of a device, can pass format strings "eth%d"
 1000 *      for wildcarding.
1001 */
1002int dev_change_name(struct net_device *dev, const char *newname)
1003{
1004        char oldname[IFNAMSIZ];
1005        int err = 0;
1006        int ret;
1007        struct net *net;
1008
1009        ASSERT_RTNL();
1010        BUG_ON(!dev_net(dev));
1011
1012        net = dev_net(dev);
1013        if (dev->flags & IFF_UP)
1014                return -EBUSY;
1015
1016        if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1017                return 0;
1018
1019        memcpy(oldname, dev->name, IFNAMSIZ);
1020
1021        err = dev_get_valid_name(net, dev, newname);
1022        if (err < 0)
1023                return err;
1024
1025rollback:
1026        ret = device_rename(&dev->dev, dev->name);
1027        if (ret) {
1028                memcpy(dev->name, oldname, IFNAMSIZ);
1029                return ret;
1030        }
1031
1032        write_lock_bh(&dev_base_lock);
1033        hlist_del_rcu(&dev->name_hlist);
1034        write_unlock_bh(&dev_base_lock);
1035
1036        synchronize_rcu();
1037
1038        write_lock_bh(&dev_base_lock);
1039        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1040        write_unlock_bh(&dev_base_lock);
1041
1042        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1043        ret = notifier_to_errno(ret);
1044
1045        if (ret) {
1046                /* err >= 0 after dev_alloc_name() or stores the first errno */
1047                if (err >= 0) {
1048                        err = ret;
1049                        memcpy(dev->name, oldname, IFNAMSIZ);
1050                        goto rollback;
1051                } else {
1052                        pr_err("%s: name change rollback failed: %d\n",
1053                               dev->name, ret);
1054                }
1055        }
1056
1057        return err;
1058}
1059
1060/**
1061 *      dev_set_alias - change ifalias of a device
1062 *      @dev: device
1063 *      @alias: name up to IFALIASZ
1064 *      @len: limit of bytes to copy from info
1065 *
 1066 *      Set ifalias for a device.
1067 */
1068int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1069{
1070        char *new_ifalias;
1071
1072        ASSERT_RTNL();
1073
1074        if (len >= IFALIASZ)
1075                return -EINVAL;
1076
1077        if (!len) {
1078                if (dev->ifalias) {
1079                        kfree(dev->ifalias);
1080                        dev->ifalias = NULL;
1081                }
1082                return 0;
1083        }
1084
1085        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1086        if (!new_ifalias)
1087                return -ENOMEM;
1088        dev->ifalias = new_ifalias;
1089
1090        strlcpy(dev->ifalias, alias, len+1);
1091        return len;
1092}
1093
1094
1095/**
1096 *      netdev_features_change - device changes features
1097 *      @dev: device to cause notification
1098 *
1099 *      Called to indicate a device has changed features.
1100 */
1101void netdev_features_change(struct net_device *dev)
1102{
1103        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1104}
1105EXPORT_SYMBOL(netdev_features_change);
1106
1107/**
1108 *      netdev_state_change - device changes state
1109 *      @dev: device to cause notification
1110 *
1111 *      Called to indicate a device has changed state. This function calls
1112 *      the notifier chains for netdev_chain and sends a NEWLINK message
1113 *      to the routing socket.
1114 */
1115void netdev_state_change(struct net_device *dev)
1116{
1117        if (dev->flags & IFF_UP) {
1118                call_netdevice_notifiers(NETDEV_CHANGE, dev);
1119                rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1120        }
1121}
1122EXPORT_SYMBOL(netdev_state_change);
1123
1124/**
1125 *      netdev_notify_peers - notify network peers about existence of @dev
1126 *      @dev: network device
1127 *
1128 * Generate traffic such that interested network peers are aware of
1129 * @dev, such as by generating a gratuitous ARP. This may be used when
1130 * a device wants to inform the rest of the network about some sort of
1131 * reconfiguration such as a failover event or virtual machine
1132 * migration.
1133 */
1134void netdev_notify_peers(struct net_device *dev)
1135{
1136        rtnl_lock();
1137        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1138        rtnl_unlock();
1139}
1140EXPORT_SYMBOL(netdev_notify_peers);
1141
1142/**
1143 *      dev_load        - load a network module
1144 *      @net: the applicable net namespace
1145 *      @name: name of interface
1146 *
1147 *      If a network interface is not present and the process has suitable
1148 *      privileges this function loads the module. If module loading is not
1149 *      available in this kernel then it becomes a nop.
1150 */
1151
1152void dev_load(struct net *net, const char *name)
1153{
1154        struct net_device *dev;
1155        int no_module;
1156
1157        rcu_read_lock();
1158        dev = dev_get_by_name_rcu(net, name);
1159        rcu_read_unlock();
1160
1161        no_module = !dev;
1162        if (no_module && capable(CAP_NET_ADMIN))
1163                no_module = request_module("netdev-%s", name);
1164        if (no_module && capable(CAP_SYS_MODULE)) {
1165                if (!request_module("%s", name))
1166                        pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1167                                name);
1168        }
1169}
1170EXPORT_SYMBOL(dev_load);
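
/*
 * Illustrative sketch (not part of dev.c): the "netdev-%s" request above is
 * satisfied by a driver that declares a matching module alias, e.g. for a
 * hypothetical interface name "mydev0":
 *
 *	MODULE_ALIAS("netdev-mydev0");
 *
 * With that alias, dev_load() can auto-load the module under CAP_NET_ADMIN
 * instead of the deprecated CAP_SYS_MODULE path.
 */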
1171
1172static int __dev_open(struct net_device *dev)
1173{
1174        const struct net_device_ops *ops = dev->netdev_ops;
1175        int ret;
1176
1177        ASSERT_RTNL();
1178
1179        if (!netif_device_present(dev))
1180                return -ENODEV;
1181
1182        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1183        ret = notifier_to_errno(ret);
1184        if (ret)
1185                return ret;
1186
1187        set_bit(__LINK_STATE_START, &dev->state);
1188
1189        if (ops->ndo_validate_addr)
1190                ret = ops->ndo_validate_addr(dev);
1191
1192        if (!ret && ops->ndo_open)
1193                ret = ops->ndo_open(dev);
1194
1195        if (ret)
1196                clear_bit(__LINK_STATE_START, &dev->state);
1197        else {
1198                dev->flags |= IFF_UP;
1199                net_dmaengine_get();
1200                dev_set_rx_mode(dev);
1201                dev_activate(dev);
1202                add_device_randomness(dev->dev_addr, dev->addr_len);
1203        }
1204
1205        return ret;
1206}
1207
1208/**
1209 *      dev_open        - prepare an interface for use.
1210 *      @dev:   device to open
1211 *
1212 *      Takes a device from down to up state. The device's private open
1213 *      function is invoked and then the multicast lists are loaded. Finally
1214 *      the device is moved into the up state and a %NETDEV_UP message is
1215 *      sent to the netdev notifier chain.
1216 *
1217 *      Calling this function on an active interface is a nop. On a failure
1218 *      a negative errno code is returned.
1219 */
1220int dev_open(struct net_device *dev)
1221{
1222        int ret;
1223
1224        if (dev->flags & IFF_UP)
1225                return 0;
1226
1227        ret = __dev_open(dev);
1228        if (ret < 0)
1229                return ret;
1230
1231        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1232        call_netdevice_notifiers(NETDEV_UP, dev);
1233
1234        return ret;
1235}
1236EXPORT_SYMBOL(dev_open);
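
/*
 * Illustrative sketch (not part of dev.c): dev_open(), like dev_close(),
 * expects the rtnl semaphore to be held, since __dev_open() asserts it:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */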
1237
1238static int __dev_close_many(struct list_head *head)
1239{
1240        struct net_device *dev;
1241
1242        ASSERT_RTNL();
1243        might_sleep();
1244
1245        list_for_each_entry(dev, head, unreg_list) {
1246                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1247
1248                clear_bit(__LINK_STATE_START, &dev->state);
1249
1250                /* Synchronize to scheduled poll. We cannot touch poll list, it
1251                 * can be even on different cpu. So just clear netif_running().
1252                 *
 1253                 * dev->stop() will invoke napi_disable() on all of its
1254                 * napi_struct instances on this device.
1255                 */
1256                smp_mb__after_clear_bit(); /* Commit netif_running(). */
1257        }
1258
1259        dev_deactivate_many(head);
1260
1261        list_for_each_entry(dev, head, unreg_list) {
1262                const struct net_device_ops *ops = dev->netdev_ops;
1263
1264                /*
1265                 *      Call the device specific close. This cannot fail.
 1266                 *      It is only called while the device is UP.
1267                 *
1268                 *      We allow it to be called even after a DETACH hot-plug
1269                 *      event.
1270                 */
1271                if (ops->ndo_stop)
1272                        ops->ndo_stop(dev);
1273
1274                dev->flags &= ~IFF_UP;
1275                net_dmaengine_put();
1276        }
1277
1278        return 0;
1279}
1280
1281static int __dev_close(struct net_device *dev)
1282{
1283        int retval;
1284        LIST_HEAD(single);
1285
1286        list_add(&dev->unreg_list, &single);
1287        retval = __dev_close_many(&single);
1288        list_del(&single);
1289        return retval;
1290}
1291
1292static int dev_close_many(struct list_head *head)
1293{
1294        struct net_device *dev, *tmp;
1295        LIST_HEAD(tmp_list);
1296
1297        list_for_each_entry_safe(dev, tmp, head, unreg_list)
1298                if (!(dev->flags & IFF_UP))
1299                        list_move(&dev->unreg_list, &tmp_list);
1300
1301        __dev_close_many(head);
1302
1303        list_for_each_entry(dev, head, unreg_list) {
1304                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1305                call_netdevice_notifiers(NETDEV_DOWN, dev);
1306        }
1307
1308        /* rollback_registered_many needs the complete original list */
1309        list_splice(&tmp_list, head);
1310        return 0;
1311}
1312
1313/**
1314 *      dev_close - shutdown an interface.
1315 *      @dev: device to shutdown
1316 *
1317 *      This function moves an active device into down state. A
1318 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1319 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1320 *      chain.
1321 */
1322int dev_close(struct net_device *dev)
1323{
1324        if (dev->flags & IFF_UP) {
1325                LIST_HEAD(single);
1326
1327                list_add(&dev->unreg_list, &single);
1328                dev_close_many(&single);
1329                list_del(&single);
1330        }
1331        return 0;
1332}
1333EXPORT_SYMBOL(dev_close);
1334
1335
1336/**
1337 *      dev_disable_lro - disable Large Receive Offload on a device
1338 *      @dev: device
1339 *
1340 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1341 *      called under RTNL.  This is needed if received packets may be
1342 *      forwarded to another interface.
1343 */
1344void dev_disable_lro(struct net_device *dev)
1345{
1346        /*
1347         * If we're trying to disable lro on a vlan device
1348         * use the underlying physical device instead
1349         */
1350        if (is_vlan_dev(dev))
1351                dev = vlan_dev_real_dev(dev);
1352
1353        dev->wanted_features &= ~NETIF_F_LRO;
1354        netdev_update_features(dev);
1355
1356        if (unlikely(dev->features & NETIF_F_LRO))
1357                netdev_WARN(dev, "failed to disable LRO!\n");
1358}
1359EXPORT_SYMBOL(dev_disable_lro);
1360
1361
1362static int dev_boot_phase = 1;
1363
1364/**
1365 *      register_netdevice_notifier - register a network notifier block
1366 *      @nb: notifier
1367 *
1368 *      Register a notifier to be called when network device events occur.
1369 *      The notifier passed is linked into the kernel structures and must
1370 *      not be reused until it has been unregistered. A negative errno code
1371 *      is returned on a failure.
1372 *
 1373 *      When registered, all registration and up events are replayed
 1374 *      to the new notifier to allow the caller to have a race-free
 1375 *      view of the network device list.
1376 */
1377
1378int register_netdevice_notifier(struct notifier_block *nb)
1379{
1380        struct net_device *dev;
1381        struct net_device *last;
1382        struct net *net;
1383        int err;
1384
1385        rtnl_lock();
1386        err = raw_notifier_chain_register(&netdev_chain, nb);
1387        if (err)
1388                goto unlock;
1389        if (dev_boot_phase)
1390                goto unlock;
1391        for_each_net(net) {
1392                for_each_netdev(net, dev) {
1393                        err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1394                        err = notifier_to_errno(err);
1395                        if (err)
1396                                goto rollback;
1397
1398                        if (!(dev->flags & IFF_UP))
1399                                continue;
1400
1401                        nb->notifier_call(nb, NETDEV_UP, dev);
1402                }
1403        }
1404
1405unlock:
1406        rtnl_unlock();
1407        return err;
1408
1409rollback:
1410        last = dev;
1411        for_each_net(net) {
1412                for_each_netdev(net, dev) {
1413                        if (dev == last)
1414                                goto outroll;
1415
1416                        if (dev->flags & IFF_UP) {
1417                                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1418                                nb->notifier_call(nb, NETDEV_DOWN, dev);
1419                        }
1420                        nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1421                }
1422        }
1423
1424outroll:
1425        raw_notifier_chain_unregister(&netdev_chain, nb);
1426        goto unlock;
1427}
1428EXPORT_SYMBOL(register_netdevice_notifier);
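
/*
 * Illustrative sketch (not part of dev.c): a minimal notifier as consumed by
 * this chain.  In this file the third callback argument is the struct
 * net_device itself (see call_netdevice_notifiers() below); the handler and
 * block names are hypothetical.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *		case NETDEV_DOWN:
 *			// react to dev changing state
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_netdev_notifier = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	// register_netdevice_notifier(&example_netdev_notifier);
 */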
1429
1430/**
1431 *      unregister_netdevice_notifier - unregister a network notifier block
1432 *      @nb: notifier
1433 *
1434 *      Unregister a notifier previously registered by
 1435 *      register_netdevice_notifier(). The notifier is unlinked from the
1436 *      kernel structures and may then be reused. A negative errno code
1437 *      is returned on a failure.
1438 *
 1439 *      After unregistering, unregister and down device events are synthesized
1440 *      for all devices on the device list to the removed notifier to remove
1441 *      the need for special case cleanup code.
1442 */
1443
1444int unregister_netdevice_notifier(struct notifier_block *nb)
1445{
1446        struct net_device *dev;
1447        struct net *net;
1448        int err;
1449
1450        rtnl_lock();
1451        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1452        if (err)
1453                goto unlock;
1454
1455        for_each_net(net) {
1456                for_each_netdev(net, dev) {
1457                        if (dev->flags & IFF_UP) {
1458                                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1459                                nb->notifier_call(nb, NETDEV_DOWN, dev);
1460                        }
1461                        nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1462                }
1463        }
1464unlock:
1465        rtnl_unlock();
1466        return err;
1467}
1468EXPORT_SYMBOL(unregister_netdevice_notifier);
1469
1470/**
1471 *      call_netdevice_notifiers - call all network notifier blocks
1472 *      @val: value passed unmodified to notifier function
1473 *      @dev: net_device pointer passed unmodified to notifier function
1474 *
1475 *      Call all network notifier blocks.  Parameters and return value
1476 *      are as for raw_notifier_call_chain().
1477 */
1478
1479int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1480{
1481        ASSERT_RTNL();
1482        return raw_notifier_call_chain(&netdev_chain, val, dev);
1483}
1484EXPORT_SYMBOL(call_netdevice_notifiers);
1485
1486static struct static_key netstamp_needed __read_mostly;
1487#ifdef HAVE_JUMP_LABEL
1488/* We are not allowed to call static_key_slow_dec() from irq context
1489 * If net_disable_timestamp() is called from irq context, defer the
1490 * static_key_slow_dec() calls.
1491 */
1492static atomic_t netstamp_needed_deferred;
1493#endif
1494
1495void net_enable_timestamp(void)
1496{
1497#ifdef HAVE_JUMP_LABEL
1498        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1499
1500        if (deferred) {
1501                while (--deferred)
1502                        static_key_slow_dec(&netstamp_needed);
1503                return;
1504        }
1505#endif
1506        WARN_ON(in_interrupt());
1507        static_key_slow_inc(&netstamp_needed);
1508}
1509EXPORT_SYMBOL(net_enable_timestamp);
1510
1511void net_disable_timestamp(void)
1512{
1513#ifdef HAVE_JUMP_LABEL
1514        if (in_interrupt()) {
1515                atomic_inc(&netstamp_needed_deferred);
1516                return;
1517        }
1518#endif
1519        static_key_slow_dec(&netstamp_needed);
1520}
1521EXPORT_SYMBOL(net_disable_timestamp);
1522
1523static inline void net_timestamp_set(struct sk_buff *skb)
1524{
1525        skb->tstamp.tv64 = 0;
1526        if (static_key_false(&netstamp_needed))
1527                __net_timestamp(skb);
1528}
1529
1530#define net_timestamp_check(COND, SKB)                  \
1531        if (static_key_false(&netstamp_needed)) {               \
1532                if ((COND) && !(SKB)->tstamp.tv64)      \
1533                        __net_timestamp(SKB);           \
1534        }                                               \
1535
1536static int net_hwtstamp_validate(struct ifreq *ifr)
1537{
1538        struct hwtstamp_config cfg;
1539        enum hwtstamp_tx_types tx_type;
1540        enum hwtstamp_rx_filters rx_filter;
1541        int tx_type_valid = 0;
1542        int rx_filter_valid = 0;
1543
1544        if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1545                return -EFAULT;
1546
1547        if (cfg.flags) /* reserved for future extensions */
1548                return -EINVAL;
1549
1550        tx_type = cfg.tx_type;
1551        rx_filter = cfg.rx_filter;
1552
1553        switch (tx_type) {
1554        case HWTSTAMP_TX_OFF:
1555        case HWTSTAMP_TX_ON:
1556        case HWTSTAMP_TX_ONESTEP_SYNC:
1557                tx_type_valid = 1;
1558                break;
1559        }
1560
1561        switch (rx_filter) {
1562        case HWTSTAMP_FILTER_NONE:
1563        case HWTSTAMP_FILTER_ALL:
1564        case HWTSTAMP_FILTER_SOME:
1565        case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1566        case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1567        case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1568        case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1569        case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1570        case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1571        case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1572        case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1573        case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1574        case HWTSTAMP_FILTER_PTP_V2_EVENT:
1575        case HWTSTAMP_FILTER_PTP_V2_SYNC:
1576        case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1577                rx_filter_valid = 1;
1578                break;
1579        }
1580
1581        if (!tx_type_valid || !rx_filter_valid)
1582                return -ERANGE;
1583
1584        return 0;
1585}
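
/*
 * Editorial example (not part of the original source): the configuration
 * validated above normally arrives from userspace via the SIOCSHWTSTAMP
 * ioctl, roughly as in this sketch (needs <linux/net_tstamp.h>,
 * <linux/sockios.h>, <net/if.h> and <sys/ioctl.h>; "eth0" is a placeholder
 * and error handling is omitted):
 *
 *	struct hwtstamp_config cfg = {
 *		.tx_type   = HWTSTAMP_TX_ON,
 *		.rx_filter = HWTSTAMP_FILTER_ALL,
 *	};
 *	struct ifreq ifr = {};
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *	ifr.ifr_data = (void *)&cfg;
 *	ioctl(fd, SIOCSHWTSTAMP, &ifr);
 */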
1586
1587static inline bool is_skb_forwardable(struct net_device *dev,
1588                                      struct sk_buff *skb)
1589{
1590        unsigned int len;
1591
1592        if (!(dev->flags & IFF_UP))
1593                return false;
1594
1595        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1596        if (skb->len <= len)
1597                return true;
1598
1599        /* if TSO is enabled, we don't care about the length as the packet
1600         * could be forwarded without being segmented first
1601         */
1602        if (skb_is_gso(skb))
1603                return true;
1604
1605        return false;
1606}
1607
1608/**
1609 * dev_forward_skb - loopback an skb to another netif
1610 *
1611 * @dev: destination network device
1612 * @skb: buffer to forward
1613 *
1614 * return values:
1615 *      NET_RX_SUCCESS  (no congestion)
1616 *      NET_RX_DROP     (packet was dropped, but freed)
1617 *
1618 * dev_forward_skb can be used for injecting an skb from the
1619 * start_xmit function of one device into the receive queue
1620 * of another device.
1621 *
1622 * The receiving device may be in another namespace, so
1623 * we have to clear all information in the skb that could
1624 * impact namespace isolation.
1625 */
1626int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1627{
1628        if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1629                if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1630                        atomic_long_inc(&dev->rx_dropped);
1631                        kfree_skb(skb);
1632                        return NET_RX_DROP;
1633                }
1634        }
1635
1636        skb_orphan(skb);
1637        nf_reset(skb);
1638
1639        if (unlikely(!is_skb_forwardable(dev, skb))) {
1640                atomic_long_inc(&dev->rx_dropped);
1641                kfree_skb(skb);
1642                return NET_RX_DROP;
1643        }
1644        skb->skb_iif = 0;
1645        skb->dev = dev;
1646        skb_dst_drop(skb);
1647        skb->tstamp.tv64 = 0;
1648        skb->pkt_type = PACKET_HOST;
1649        skb->protocol = eth_type_trans(skb, dev);
1650        skb->mark = 0;
1651        secpath_reset(skb);
1652        nf_reset(skb);
1653        return netif_rx(skb);
1654}
1655EXPORT_SYMBOL_GPL(dev_forward_skb);
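
/*
 * Editorial example (not part of the original source): a veth-style pair
 * device could use dev_forward_skb() from its ndo_start_xmit to hand each
 * frame straight to its peer.  struct example_priv with its ->peer pointer
 * and example_xmit() are hypothetical; note that skb->len must be sampled
 * before the call, because dev_forward_skb() consumes the skb either way.
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct example_priv *priv = netdev_priv(dev);
 *		unsigned int len = skb->len;
 *
 *		if (dev_forward_skb(priv->peer, skb) == NET_RX_SUCCESS) {
 *			dev->stats.tx_packets++;
 *			dev->stats.tx_bytes += len;
 *		} else {
 *			dev->stats.tx_dropped++;
 *		}
 *		return NETDEV_TX_OK;
 *	}
 */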
1656
1657static inline int deliver_skb(struct sk_buff *skb,
1658                              struct packet_type *pt_prev,
1659                              struct net_device *orig_dev)
1660{
1661        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1662                return -ENOMEM;
1663        atomic_inc(&skb->users);
1664        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1665}
1666
1667static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1668{
1669        if (!ptype->af_packet_priv || !skb->sk)
1670                return false;
1671
1672        if (ptype->id_match)
1673                return ptype->id_match(ptype, skb->sk);
1674        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1675                return true;
1676
1677        return false;
1678}
1679
1680/*
1681 *      Support routine. Sends outgoing frames to any network
1682 *      taps currently in use.
1683 */
1684
1685static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1686{
1687        struct packet_type *ptype;
1688        struct sk_buff *skb2 = NULL;
1689        struct packet_type *pt_prev = NULL;
1690
1691        rcu_read_lock();
1692        list_for_each_entry_rcu(ptype, &ptype_all, list) {
1693                /* Never send packets back to the socket
1694                 * they originated from - MvS (miquels@drinkel.ow.org)
1695                 */
1696                if ((ptype->dev == dev || !ptype->dev) &&
1697                    (!skb_loop_sk(ptype, skb))) {
1698                        if (pt_prev) {
1699                                deliver_skb(skb2, pt_prev, skb->dev);
1700                                pt_prev = ptype;
1701                                continue;
1702                        }
1703
1704                        skb2 = skb_clone(skb, GFP_ATOMIC);
1705                        if (!skb2)
1706                                break;
1707
1708                        net_timestamp_set(skb2);
1709
1710                        /* The network header should already be set
1711                           correctly by the sender, so the check below
1712                           is just protection against buggy protocols.
1713                         */
1714                        skb_reset_mac_header(skb2);
1715
1716                        if (skb_network_header(skb2) < skb2->data ||
1717                            skb2->network_header > skb2->tail) {
1718                                net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1719                                                     ntohs(skb2->protocol),
1720                                                     dev->name);
1721                                skb_reset_network_header(skb2);
1722                        }
1723
1724                        skb2->transport_header = skb2->network_header;
1725                        skb2->pkt_type = PACKET_OUTGOING;
1726                        pt_prev = ptype;
1727                }
1728        }
1729        if (pt_prev)
1730                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1731        rcu_read_unlock();
1732}
1733
1734/**
1735 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1736 * @dev: Network device
1737 * @txq: number of queues available
1738 *
1739 * If real_num_tx_queues is changed the tc mappings may no longer be
1740 * valid. To resolve this, verify each tc mapping is still valid and,
1741 * if not, reset it to TC0; with no priorities mapping to the stale
1742 * offset/count pair it will no longer be used. In the worst case, when
1743 * TC0 itself is invalid, nothing can be done, so priority mappings are
1744 * disabled entirely. It is expected that drivers will fix the mappings
1745 * if they can before calling netif_set_real_num_tx_queues.
1746 */
1747static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1748{
1749        int i;
1750        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1751
1752        /* If TC0 is invalidated disable TC mapping */
1753        if (tc->offset + tc->count > txq) {
1754                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1755                dev->num_tc = 0;
1756                return;
1757        }
1758
1759        /* Invalidated prio to tc mappings set to TC0 */
1760        for (i = 1; i < TC_BITMASK + 1; i++) {
1761                int q = netdev_get_prio_tc_map(dev, i);
1762
1763                tc = &dev->tc_to_txq[q];
1764                if (tc->offset + tc->count > txq) {
1765                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1766                                i, q);
1767                        netdev_set_prio_tc_map(dev, i, 0);
1768                }
1769        }
1770}
1771
1772/*
1773 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1774 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1775 */
1776int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1777{
1778        int rc;
1779
1780        if (txq < 1 || txq > dev->num_tx_queues)
1781                return -EINVAL;
1782
1783        if (dev->reg_state == NETREG_REGISTERED ||
1784            dev->reg_state == NETREG_UNREGISTERING) {
1785                ASSERT_RTNL();
1786
1787                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1788                                                  txq);
1789                if (rc)
1790                        return rc;
1791
1792                if (dev->num_tc)
1793                        netif_setup_tc(dev, txq);
1794
1795                if (txq < dev->real_num_tx_queues)
1796                        qdisc_reset_all_tx_gt(dev, txq);
1797        }
1798
1799        dev->real_num_tx_queues = txq;
1800        return 0;
1801}
1802EXPORT_SYMBOL(netif_set_real_num_tx_queues);
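
/*
 * Editorial example (not part of the original source): ethtool operations
 * already run under RTNL, so a driver's .set_channels hook is one natural
 * place to resize the active TX queue set at runtime.  example_set_channels()
 * is hypothetical, and a real driver would also reconfigure its hardware
 * rings around the change.
 *
 *	static int example_set_channels(struct net_device *dev,
 *					struct ethtool_channels *ch)
 *	{
 *		return netif_set_real_num_tx_queues(dev, ch->combined_count);
 *	}
 */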
1803
1804#ifdef CONFIG_RPS
1805/**
1806 *      netif_set_real_num_rx_queues - set actual number of RX queues used
1807 *      @dev: Network device
1808 *      @rxq: Actual number of RX queues
1809 *
1810 *      This must be called either with the rtnl_lock held or before
1811 *      registration of the net device.  Returns 0 on success, or a
1812 *      negative error code.  If called before registration, it always
1813 *      succeeds.
1814 */
1815int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1816{
1817        int rc;
1818
1819        if (rxq < 1 || rxq > dev->num_rx_queues)
1820                return -EINVAL;
1821
1822        if (dev->reg_state == NETREG_REGISTERED) {
1823                ASSERT_RTNL();
1824
1825                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1826                                                  rxq);
1827                if (rc)
1828                        return rc;
1829        }
1830
1831        dev->real_num_rx_queues = rxq;
1832        return 0;
1833}
1834EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1835#endif
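
/*
 * Editorial example (not part of the original source): when called from
 * probe, before register_netdev(), both setters always succeed, so a driver
 * can simply size the active queues to whatever the hardware reported, as
 * long as the counts do not exceed those passed to alloc_etherdev_mqs().
 * example_size_queues() is hypothetical.
 *
 *	static void example_size_queues(struct net_device *dev,
 *					unsigned int hw_queues)
 *	{
 *		netif_set_real_num_tx_queues(dev, hw_queues);
 *		netif_set_real_num_rx_queues(dev, hw_queues);
 *	}
 */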
1836
1837/**
1838 * netif_get_num_default_rss_queues - default number of RSS queues
1839 *
1840 * This routine should set an upper limit on the number of RSS queues
1841 * used by default by multiqueue devices.
1842 */
1843int netif_get_num_default_rss_queues(void)
1844{
1845        return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1846}
1847EXPORT_SYMBOL(netif_get_num_default_rss_queues);
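
/*
 * Editorial example (not part of the original source): drivers typically
 * clamp their ring count against this default instead of creating one queue
 * per CPU on very large machines.  EXAMPLE_MAX_HW_RINGS is a hypothetical
 * hardware limit.
 *
 *	unsigned int rings = min_t(unsigned int, EXAMPLE_MAX_HW_RINGS,
 *				   netif_get_num_default_rss_queues());
 */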
1848
1849static inline void __netif_reschedule(struct Qdisc *q)
1850{
1851        struct softnet_data *sd;
1852        unsigned long flags;
1853
1854        local_irq_save(flags);
1855        sd = &__get_cpu_var(softnet_data);
1856        q->next_sched = NULL;
1857        *sd->output_queue_tailp = q;
1858        sd->output_queue_tailp = &q->next_sched;
1859        raise_softirq_irqoff(NET_TX_SOFTIRQ);
1860        local_irq_restore(flags);
1861}
1862
1863void __netif_schedule(struct Qdisc *q)
1864{
1865        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1866                __netif_reschedule(q);
1867}
1868EXPORT_SYMBOL(__netif_schedule);
1869
1870void dev_kfree_skb_irq(struct sk_buff *skb)
1871{
1872        if (atomic_dec_and_test(&skb->users)) {
1873                struct softnet_data *sd;
1874                unsigned long flags;
1875
1876                local_irq_save(flags);
1877                sd = &__get_cpu_var(softnet_data);
1878                skb->next = sd->completion_queue;
1879                sd->completion_queue = skb;
1880                raise_softirq_irqoff(NET_TX_SOFTIRQ);
1881                local_irq_restore(flags);
1882        }
1883}
1884EXPORT_SYMBOL(dev_kfree_skb_irq);
1885
1886void dev_kfree_skb_any(struct sk_buff *skb)
1887{
1888        if (in_irq() || irqs_disabled())
1889                dev_kfree_skb_irq(skb);
1890        else
1891                dev_kfree_skb(skb);
1892}
1893EXPORT_SYMBOL(dev_kfree_skb_any);
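
/*
 * Editorial example (not part of the original source): a TX-ring cleanup
 * helper that can be reached both from the TX completion interrupt and from
 * process context (e.g. while tearing the ring down in ndo_stop) would use
 * dev_kfree_skb_any() so the correct variant is picked automatically.  The
 * struct example_ring layout is hypothetical.
 *
 *	static void example_clean_tx_ring(struct example_ring *ring)
 *	{
 *		while (ring->to_clean != ring->to_use) {
 *			dev_kfree_skb_any(ring->skbs[ring->to_clean]);
 *			ring->skbs[ring->to_clean] = NULL;
 *			ring->to_clean = (ring->to_clean + 1) % ring->size;
 *		}
 *	}
 */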
1894
1895
1896/**
1897 * netif_device_detach - mark device as removed
1898 * @dev: network device
1899 *
1900 * Mark the device as removed from the system and therefore no longer available.
1901 */
1902void netif_device_detach(struct net_device *dev)
1903{
1904        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1905            netif_running(dev)) {
1906                netif_tx_stop_all_queues(dev);
1907        }
1908}
1909EXPORT_SYMBOL(netif_device_detach);
1910
1911/**
1912 * netif_device_attach - mark device as attached
1913 * @dev: network device
1914 *
1915 * Mark the device as attached to the system and restart it if needed.
1916 */
1917void netif_device_attach(struct net_device *dev)
1918{
1919        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1920            netif_running(dev)) {
1921                netif_tx_wake_all_queues(dev);
1922                __netdev_watchdog_up(dev);
1923        }
1924}
1925EXPORT_SYMBOL(netif_device_attach);
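
/*
 * Editorial example (not part of the original source): PCI drivers commonly
 * bracket suspend/resume with this pair so the stack stops handing them
 * packets while the hardware is powered down.  example_suspend() and
 * example_resume() are hypothetical, and a real driver would also quiesce
 * DMA and save/restore device state around these calls.
 *
 *	static int example_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		return 0;
 *	}
 *
 *	static int example_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */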
1926
1927static void skb_warn_bad_offload(const struct sk_buff *skb)
1928{
1929        static const netdev_features_t null_features = 0;
1930        struct net_device *dev = skb->dev;
1931        const char *driver = "";
1932
1933        if (dev && dev->dev.parent)
1934                driver = dev_driver_string(dev->dev.parent);
1935
1936        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1937             "gso_type=%d ip_summed=%d\n",
1938             driver, dev ? &dev->features : &null_features,
1939             skb->sk ? &skb->sk->sk_route_caps : &null_features,
1940             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1941             skb_shinfo(skb)->gso_type, skb->ip_summed);
1942}
1943
1944/*
1945 * Invalidate hardware checksum when packet is to be mangled, and
1946 * complete checksum manually on outgoing path.
1947 */
1948int skb_checksum_help(struct sk_buff *skb)
1949{
1950        __wsum csum;
1951        int ret = 0, offset;
1952
1953        if (skb->ip_summed == CHECKSUM_COMPLETE)
1954                goto out_set_summed;
1955
1956        if (unlikely(skb_shinfo(skb)->gso_size)) {
1957                skb_warn_bad_offload(skb);
1958                return -EINVAL;
1959        }
1960
1961        offset = skb_checksum_start_offset(skb);
1962        BUG_ON(offset >= skb_headlen(skb));
1963        csum = skb_checksum(skb, offset, skb->len - offset, 0);
1964
1965        offset += skb->csum_offset;
1966        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1967
1968        if (skb_cloned(skb) &&
1969            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1970                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1971                if (ret)
1972                        goto out;
1973        }
1974
1975        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1976out_set_summed:
1977        skb->ip_summed = CHECKSUM_NONE;
1978out:
1979        return ret;
1980}
1981EXPORT_SYMBOL(skb_checksum_help);
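
/*
 * Editorial example (not part of the original source): a driver whose
 * hardware can only checksum a few protocols can fall back to
 * skb_checksum_help() in its xmit path for everything else.
 * example_hw_can_csum() and the drop label are hypothetical.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !example_hw_can_csum(skb) &&
 *	    skb_checksum_help(skb))
 *		goto drop;
 */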
1982
1983/**
1984 *      skb_gso_segment - Perform segmentation on skb.
1985 *      @skb: buffer to segment
1986 *      @features: features for the output path (see dev->features)
1987 *
1988 *      This function segments the given skb and returns a list of segments.
1989 *
1990 *      It may return NULL if the skb requires no segmentation.  This is
1991 *      only possible when GSO is used for verifying header integrity.
1992 */
1993struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1994        netdev_features_t features)
1995{
1996        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1997        struct packet_type *ptype;
1998        __be16 type = skb->protocol;
1999        int vlan_depth = ETH_HLEN;
2000        int err;
2001
2002        while (type == htons(ETH_P_8021Q)) {
2003                struct vlan_hdr *vh;
2004
2005                if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2006                        return ERR_PTR(-EINVAL);
2007
2008                vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2009                type = vh->h_vlan_encapsulated_proto;
2010                vlan_depth += VLAN_HLEN;
2011        }
2012
2013        skb_reset_mac_header(skb);
2014        skb->mac_len = skb->network_header - skb->mac_header;
2015        __skb_pull(skb, skb->mac_len);
2016
2017        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2018                skb_warn_bad_offload(skb);
2019
2020                if (skb_header_cloned(skb) &&
2021                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2022                        return ERR_PTR(err);
2023        }
2024
2025        rcu_read_lock();
2026        list_for_each_entry_rcu(ptype,
2027                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2028                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
2029                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2030                                err = ptype->gso_send_check(skb);
2031                                segs = ERR_PTR(err);
2032                                if (err || skb_gso_ok(skb, features))
2033                                        break;
2034                                __skb_push(skb, (skb->data -
2035                                                 skb_network_header(skb)));
2036                        }
2037                        segs = ptype->gso_segment(skb, features);
2038                        break;
2039                }
2040        }
2041        rcu_read_unlock();
2042
2043        __skb_push(skb, skb->data - skb_mac_header(skb));
2044
2045        return segs;
2046}
2047EXPORT_SYMBOL(skb_gso_segment);
2048
2049/* Take action when hardware reception checksum errors are detected. */
2050#ifdef CONFIG_BUG
2051void netdev_rx_csum_fault(struct net_device *dev)
2052{
2053        if (net_ratelimit()) {
2054                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2055                dump_stack();
2056        }
2057}
2058EXPORT_SYMBOL(netdev_rx_csum_fault);
2059#endif
2060
2061/* Actually, we should eliminate this check as soon as we know that:
2062 * 1. An IOMMU is present and allows mapping of all the memory.
2063 * 2. No high memory really exists on this machine.
2064 */
2065
2066static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2067{
2068#ifdef CONFIG_HIGHMEM
2069        int i;
2070        if (!(dev->features & NETIF_F_HIGHDMA)) {
2071                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2072                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2073                        if (PageHighMem(skb_frag_page(frag)))
2074                                return 1;
2075                }
2076        }
2077
2078        if (PCI_DMA_BUS_IS_PHYS) {
2079                struct device *pdev = dev->dev.parent;
2080
2081                if (!pdev)
2082                        return 0;
2083                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2084                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2085                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2086                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2087                                return 1;
2088                }
2089        }
2090#endif
2091        return 0;
2092}
2093
2094struct dev_gso_cb {
2095        void (*destructor)(struct sk_buff *skb);
2096};
2097
2098#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2099
2100static void dev_gso_skb_destructor(struct sk_buff *skb)
2101{
2102        struct dev_gso_cb *cb;
2103
2104        do {
2105                struct sk_buff *nskb = skb->next;
2106
2107                skb->next = nskb->next;
2108                nskb->next = NULL;
2109                kfree_skb(nskb);
2110        } while (skb->next);
2111
2112        cb = DEV_GSO_CB(skb);
2113        if (cb->destructor)
2114                cb->destructor(skb);
2115}
2116
2117/**
2118 *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2119 *      @skb: buffer to segment
2120 *      @features: device features as applicable to this skb
2121 *
2122 *      This function segments the given skb and stores the list of segments
2123 *      in skb->next.
2124 */
2125static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2126{
2127        struct sk_buff *segs;
2128
2129        segs = skb_gso_segment(skb, features);
2130
2131        /* Verifying header integrity only. */
2132        if (!segs)
2133                return 0;
2134
2135        if (IS_ERR(segs))
2136                return PTR_ERR(segs);
2137
2138        skb->next = segs;
2139        DEV_GSO_CB(skb)->destructor = skb->destructor;
2140        skb->destructor = dev_gso_skb_destructor;
2141
2142        return 0;
2143}
2144
2145static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2146{
2147        return ((features & NETIF_F_GEN_CSUM) ||
2148                ((features & NETIF_F_V4_CSUM) &&
2149                 protocol == htons(ETH_P_IP)) ||
2150                ((features & NETIF_F_V6_CSUM) &&
2151                 protocol == htons(ETH_P_IPV6)) ||
2152                ((features & NETIF_F_FCOE_CRC) &&
2153                 protocol == htons(ETH_P_FCOE)));
2154}
2155
2156static netdev_features_t harmonize_features(struct sk_buff *skb,
2157        __be16 protocol, netdev_features_t features)
2158{
2159        if (skb->ip_summed != CHECKSUM_NONE &&
2160            !can_checksum_protocol(features, protocol)) {
2161                features &= ~NETIF_F_ALL_CSUM;
2162                features &= ~NETIF_F_SG;
2163        } else if (illegal_highdma(skb->dev, skb)) {
2164                features &= ~NETIF_F_SG;
2165        }
2166
2167        return features;
2168}
2169
2170netdev_features_t netif_skb_features(struct sk_buff *skb)
2171{
2172        __be16 protocol = skb->protocol;
2173        netdev_features_t features = skb->dev->features;
2174
2175        if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2176                features &= ~NETIF_F_GSO_MASK;
2177
2178        if (protocol == htons(ETH_P_8021Q)) {
2179                struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2180                protocol = veh->h_vlan_encapsulated_proto;
2181        } else if (!vlan_tx_tag_present(skb)) {
2182                return harmonize_features(skb, protocol, features);
2183        }
2184
2185        features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2186
2187        if (protocol != htons(ETH_P_8021Q)) {
2188                return harmonize_features(skb, protocol, features);
2189        } else {
2190                features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2191                                NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2192                return harmonize_features(skb, protocol, features);
2193        }
2194}
2195EXPORT_SYMBOL(netif_skb_features);
2196
2197/*
2198 * Returns true if either:
2199 *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2200 *      2. skb is fragmented and the device does not support SG.
2201 */
2202static inline int skb_needs_linearize(struct sk_buff *skb,
2203                                      int features)
2204{
2205        return skb_is_nonlinear(skb) &&
2206                        ((skb_has_frag_list(skb) &&
2207                                !(features & NETIF_F_FRAGLIST)) ||
2208                        (skb_shinfo(skb)->nr_frags &&
2209                                !(features & NETIF_F_SG)));
2210}
2211
2212int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2213                        struct netdev_queue *txq)
2214{
2215        const struct net_device_ops *ops = dev->netdev_ops;
2216        int rc = NETDEV_TX_OK;
2217        unsigned int skb_len;
2218
2219        if (likely(!skb->next)) {
2220                netdev_features_t features;
2221
2222                /*
2223                 * If the device doesn't need skb->dst, release it right now while
2224                 * it's hot in this CPU's cache.
2225                 */
2226                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2227                        skb_dst_drop(skb);
2228
2229                features = netif_skb_features(skb);
2230
2231                if (vlan_tx_tag_present(skb) &&
2232                    !(features & NETIF_F_HW_VLAN_TX)) {
2233                        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2234                        if (unlikely(!skb))
2235                                goto out;
2236
2237                        skb->vlan_tci = 0;
2238                }
2239
2240                if (netif_needs_gso(skb, features)) {
2241                        if (unlikely(dev_gso_segment(skb, features)))
2242                                goto out_kfree_skb;
2243                        if (skb->next)
2244                                goto gso;
2245                } else {
2246                        if (skb_needs_linearize(skb, features) &&
2247                            __skb_linearize(skb))
2248                                goto out_kfree_skb;
2249
2250                        /* If packet is not checksummed and device does not
2251                         * support checksumming for this protocol, complete
2252                         * checksumming here.
2253                         */
2254                        if (skb->ip_summed == CHECKSUM_PARTIAL) {
2255                                skb_set_transport_header(skb,
2256                                        skb_checksum_start_offset(skb));
2257                                if (!(features & NETIF_F_ALL_CSUM) &&
2258                                     skb_checksum_help(skb))
2259                                        goto out_kfree_skb;
2260                        }
2261                }
2262
2263                if (!list_empty(&ptype_all))
2264                        dev_queue_xmit_nit(skb, dev);
2265
2266                skb_len = skb->len;
2267                rc = ops->ndo_start_xmit(skb, dev);
2268                trace_net_dev_xmit(skb, rc, dev, skb_len);
2269                if (rc == NETDEV_TX_OK)
2270                        txq_trans_update(txq);
2271                return rc;
2272        }
2273
2274gso:
2275        do {
2276                struct sk_buff *nskb = skb->next;
2277
2278                skb->next = nskb->next;
2279                nskb->next = NULL;
2280
2281                /*
2282                 * If the device doesn't need nskb->dst, release it right now while
2283                 * it's hot in this CPU's cache.
2284                 */
2285                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2286                        skb_dst_drop(nskb);
2287
2288                if (!list_empty(&ptype_all))
2289                        dev_queue_xmit_nit(nskb, dev);
2290
2291                skb_len = nskb->len;
2292                rc = ops->ndo_start_xmit(nskb, dev);
2293                trace_net_dev_xmit(nskb, rc, dev, skb_len);
2294                if (unlikely(rc != NETDEV_TX_OK)) {
2295                        if (rc & ~NETDEV_TX_MASK)
2296                                goto out_kfree_gso_skb;
2297                        nskb->next = skb->next;
2298                        skb->next = nskb;
2299                        return rc;
2300                }
2301                txq_trans_update(txq);
2302                if (unlikely(netif_xmit_stopped(txq) && skb->next))
2303                        return NETDEV_TX_BUSY;
2304        } while (skb->next);
2305
2306out_kfree_gso_skb:
2307        if (likely(skb->next == NULL))
2308                skb->destructor = DEV_GSO_CB(skb)->destructor;
2309out_kfree_skb:
2310        kfree_skb(skb);
2311out:
2312        return rc;
2313}
2314
2315static u32 hashrnd __read_mostly;
2316
2317/*
2318 * Returns a Tx hash based on the given packet descriptor and the number of
2319 * Tx queues to be used as a distribution range.
2320 */
2321u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2322                  unsigned int num_tx_queues)
2323{
2324        u32 hash;
2325        u16 qoffset = 0;
2326        u16 qcount = num_tx_queues;
2327
2328        if (skb_rx_queue_recorded(skb)) {
2329                hash = skb_get_rx_queue(skb);
2330                while (unlikely(hash >= num_tx_queues))
2331                        hash -= num_tx_queues;
2332                return hash;
2333        }
2334
2335        if (dev->num_tc) {
2336                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2337                qoffset = dev->tc_to_txq[tc].offset;
2338                qcount = dev->tc_to_txq[tc].count;
2339        }
2340
2341        if (skb->sk && skb->sk->sk_hash)
2342                hash = skb->sk->sk_hash;
2343        else
2344                hash = (__force u16) skb->protocol;
2345        hash = jhash_1word(hash, hashrnd);
2346
2347        return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2348}
2349EXPORT_SYMBOL(__skb_tx_hash);
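
/*
 * Editorial example (not part of the original source): a driver that wants
 * the stack's default spreading inside its own ndo_select_queue can simply
 * delegate to this helper over its active queue count (skb_tx_hash() is the
 * usual convenience wrapper doing exactly this).  example_select_queue() is
 * hypothetical.
 *
 *	static u16 example_select_queue(struct net_device *dev,
 *					struct sk_buff *skb)
 *	{
 *		return __skb_tx_hash(dev, skb, dev->real_num_tx_queues);
 *	}
 */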
2350
2351static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2352{
2353        if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2354                net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2355                                     dev->name, queue_index,
2356                                     dev->real_num_tx_queues);
2357                return 0;
2358        }
2359        return queue_index;
2360}
2361
2362static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2363{
2364#ifdef CONFIG_XPS
2365        struct xps_dev_maps *dev_maps;
2366        struct xps_map *map;
2367        int queue_index = -1;
2368
2369        rcu_read_lock();
2370        dev_maps = rcu_dereference(dev->xps_maps);
2371        if (dev_maps) {
2372                map = rcu_dereference(
2373                    dev_maps->cpu_map[raw_smp_processor_id()]);
2374                if (map) {
2375                        if (map->len == 1)
2376                                queue_index = map->queues[0];
2377                        else {
2378                                u32 hash;
2379                                if (skb->sk && skb->sk->sk_hash)
2380                                        hash = skb->sk->sk_hash;
2381                                else
2382                                        hash = (__force u16) skb->protocol ^
2383                                            skb->rxhash;
2384                                hash = jhash_1word(hash, hashrnd);
2385                                queue_index = map->queues[
2386                                    ((u64)hash * map->len) >> 32];
2387                        }
2388                        if (unlikely(queue_index >= dev->real_num_tx_queues))
2389                                queue_index = -1;
2390                }
2391        }
2392        rcu_read_unlock();
2393
2394        return queue_index;
2395#else
2396        return -1;
2397#endif
2398}
2399
2400struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2401                                    struct sk_buff *skb)
2402{
2403        int queue_index;
2404        const struct net_device_ops *ops = dev->netdev_ops;
2405
2406        if (dev->real_num_tx_queues == 1)
2407                queue_index = 0;
2408        else if (ops->ndo_select_queue) {
2409                queue_index = ops->ndo_select_queue(dev, skb);
2410                queue_index = dev_cap_txqueue(dev, queue_index);
2411        } else {
2412                struct sock *sk = skb->sk;
2413                queue_index = sk_tx_queue_get(sk);
2414
2415                if (queue_index < 0 || skb->ooo_okay ||
2416                    queue_index >= dev->real_num_tx_queues) {
2417                        int old_index = queue_index;
2418
2419                        queue_index = get_xps_queue(dev, skb);
2420                        if (queue_index < 0)
2421                                queue_index = skb_tx_hash(dev, skb);
2422
2423                        if (queue_index != old_index && sk) {
2424                                struct dst_entry *dst =
2425                                    rcu_dereference_check(sk->sk_dst_cache, 1);
2426
2427                                if (dst && skb_dst(skb) == dst)
2428                                        sk_tx_queue_set(sk, queue_index);
2429                        }
2430                }
2431        }
2432
2433        skb_set_queue_mapping(skb, queue_index);
2434        return netdev_get_tx_queue(dev, queue_index);
2435}
2436
2437static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2438                                 struct net_device *dev,
2439                                 struct netdev_queue *txq)
2440{
2441        spinlock_t *root_lock = qdisc_lock(q);
2442        bool contended;
2443        int rc;
2444
2445        qdisc_skb_cb(skb)->pkt_len = skb->len;
2446        qdisc_calculate_pkt_len(skb, q);
2447        /*
2448         * Heuristic to force contended enqueues to serialize on a
2449         * separate lock before trying to get qdisc main lock.
2450         * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2451         * and dequeue packets faster.
2452         */
2453        contended = qdisc_is_running(q);
2454        if (unlikely(contended))
2455                spin_lock(&q->busylock);
2456
2457        spin_lock(root_lock);
2458        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2459                kfree_skb(skb);
2460                rc = NET_XMIT_DROP;
2461        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2462                   qdisc_run_begin(q)) {
2463                /*
2464                 * This is a work-conserving queue; there are no old skbs
2465                 * waiting to be sent out; and the qdisc is not running -
2466                 * xmit the skb directly.
2467                 */
2468                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2469                        skb_dst_force(skb);
2470
2471                qdisc_bstats_update(q, skb);
2472
2473                if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2474                        if (unlikely(contended)) {
2475                                spin_unlock(&q->busylock);
2476                                contended = false;
2477                        }
2478                        __qdisc_run(q);
2479                } else
2480                        qdisc_run_end(q);
2481
2482                rc = NET_XMIT_SUCCESS;
2483        } else {
2484                skb_dst_force(skb);
2485                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2486                if (qdisc_run_begin(q)) {
2487                        if (unlikely(contended)) {
2488                                spin_unlock(&q->busylock);
2489                                contended = false;
2490                        }
2491                        __qdisc_run(q);
2492                }
2493        }
2494        spin_unlock(root_lock);
2495        if (unlikely(contended))
2496                spin_unlock(&q->busylock);
2497        return rc;
2498}
2499
2500#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2501static void skb_update_prio(struct sk_buff *skb)
2502{
2503        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2504
2505        if (!skb->priority && skb->sk && map) {
2506                unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2507
2508                if (prioidx < map->priomap_len)
2509                        skb->priority = map->priomap[prioidx];
2510        }
2511}
2512#else
2513#define skb_update_prio(skb)
2514#endif
2515
2516static DEFINE_PER_CPU(int, xmit_recursion);
2517#define RECURSION_LIMIT 10
2518
2519/**
2520 *      dev_loopback_xmit - loop back @skb
2521 *      @skb: buffer to transmit
2522 */
2523int dev_loopback_xmit(struct sk_buff *skb)
2524{
2525        skb_reset_mac_header(skb);
2526        __skb_pull(skb, skb_network_offset(skb));
2527        skb->pkt_type = PACKET_LOOPBACK;
2528        skb->ip_summed = CHECKSUM_UNNECESSARY;
2529        WARN_ON(!skb_dst(skb));
2530        skb_dst_force(skb);
2531        netif_rx_ni(skb);
2532        return 0;
2533}
2534EXPORT_SYMBOL(dev_loopback_xmit);
2535
2536/**
2537 *      dev_queue_xmit - transmit a buffer
2538 *      @skb: buffer to transmit
2539 *
2540 *      Queue a buffer for transmission to a network device. The caller must
2541 *      have set the device and priority and built the buffer before calling
2542 *      this function. The function can be called from an interrupt.
2543 *
2544 *      A negative errno code is returned on a failure. A success does not
2545 *      guarantee the frame will be transmitted as it may be dropped due
2546 *      to congestion or traffic shaping.
2547 *
2548 * -----------------------------------------------------------------------------------
2549 *      I notice this method can also return errors from the queue disciplines,
2550 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2551 *      be positive.
2552 *
2553 *      Regardless of the return value, the skb is consumed, so it is currently
2554 *      difficult to retry a send to this method.  (You can bump the ref count
2555 *      before sending to hold a reference for retry if you are careful.)
2556 *
2557 *      When calling this method, interrupts MUST be enabled.  This is because
2558 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2559 *          --BLG
2560 */
2561int dev_queue_xmit(struct sk_buff *skb)
2562{
2563        struct net_device *dev = skb->dev;
2564        struct netdev_queue *txq;
2565        struct Qdisc *q;
2566        int rc = -ENOMEM;
2567
2568        /* Disable soft irqs for various locks below. Also
2569         * stops preemption for RCU.
2570         */
2571        rcu_read_lock_bh();
2572
2573        skb_update_prio(skb);
2574
2575        txq = netdev_pick_tx(dev, skb);
2576        q = rcu_dereference_bh(txq->qdisc);
2577
2578#ifdef CONFIG_NET_CLS_ACT
2579        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2580#endif
2581        trace_net_dev_queue(skb);
2582        if (q->enqueue) {
2583                rc = __dev_xmit_skb(skb, q, dev, txq);
2584                goto out;
2585        }
2586
2587        /* The device has no queue. Common case for software devices:
2588           loopback, all sorts of tunnels...
2589
2590           Really, it is unlikely that netif_tx_lock protection is necessary
2591           here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2592           counters.)
2593           However, it is possible that they rely on the protection
2594           made by us here.
2595
2596           Check this and shoot the lock. It is not prone to deadlocks.
2597           Or shoot the noqueue qdisc; it is even simpler 8)
2598         */
2599        if (dev->flags & IFF_UP) {
2600                int cpu = smp_processor_id(); /* ok because BHs are off */
2601
2602                if (txq->xmit_lock_owner != cpu) {
2603
2604                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2605                                goto recursion_alert;
2606
2607                        HARD_TX_LOCK(dev, txq, cpu);
2608
2609                        if (!netif_xmit_stopped(txq)) {
2610                                __this_cpu_inc(xmit_recursion);
2611                                rc = dev_hard_start_xmit(skb, dev, txq);
2612                                __this_cpu_dec(xmit_recursion);
2613                                if (dev_xmit_complete(rc)) {
2614                                        HARD_TX_UNLOCK(dev, txq);
2615                                        goto out;
2616                                }
2617                        }
2618                        HARD_TX_UNLOCK(dev, txq);
2619                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2620                                             dev->name);
2621                } else {
2622                        /* Recursion is detected! It is possible,
2623                         * unfortunately
2624                         */
2625recursion_alert:
2626                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2627                                             dev->name);
2628                }
2629        }
2630
2631        rc = -ENETDOWN;
2632        rcu_read_unlock_bh();
2633
2634        kfree_skb(skb);
2635        return rc;
2636out:
2637        rcu_read_unlock_bh();
2638        return rc;
2639}
2640EXPORT_SYMBOL(dev_queue_xmit);
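
/*
 * Editorial example (not part of the original source): a minimal sketch of
 * a kernel module pushing a raw Ethernet frame through this entry point.
 * example_send_frame() is hypothetical, "eth0" and the 64-byte zero payload
 * are placeholders, and ETH_P_802_EX1 is just the local-experimental
 * ethertype.
 *
 *	static int example_send_frame(void)
 *	{
 *		struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *		struct sk_buff *skb;
 *
 *		if (!dev)
 *			return -ENODEV;
 *
 *		skb = alloc_skb(LL_RESERVED_SPACE(dev) + 64, GFP_KERNEL);
 *		if (!skb) {
 *			dev_put(dev);
 *			return -ENOMEM;
 *		}
 *		skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *		skb_reset_network_header(skb);
 *		memset(skb_put(skb, 64), 0, 64);
 *		skb->dev = dev;
 *		skb->protocol = htons(ETH_P_802_EX1);
 *		dev_hard_header(skb, dev, ETH_P_802_EX1, dev->broadcast,
 *				dev->dev_addr, skb->len);
 *		dev_queue_xmit(skb);
 *		dev_put(dev);
 *		return 0;
 *	}
 */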
2641
2642
2643/*=======================================================================
2644                        Receiver routines
2645  =======================================================================*/
2646
2647int netdev_max_backlog __read_mostly = 1000;
2648EXPORT_SYMBOL(netdev_max_backlog);
2649
2650int netdev_tstamp_prequeue __read_mostly = 1;
2651int netdev_budget __read_mostly = 300;
2652int weight_p __read_mostly = 64;            /* old backlog weight */
2653
2654/* Called with irq disabled */
2655static inline void ____napi_schedule(struct softnet_data *sd,
2656                                     struct napi_struct *napi)
2657{
2658        list_add_tail(&napi->poll_list, &sd->poll_list);
2659        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2660}
2661
2662/*
2663 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2664 * and src/dst port numbers.  Sets rxhash in the skb to a non-zero hash
2665 * value on success; zero indicates no valid hash.  Also sets l4_rxhash in
2666 * the skb if the hash is a canonical 4-tuple hash over transport ports.
2667 */
2668void __skb_get_rxhash(struct sk_buff *skb)
2669{
2670        struct flow_keys keys;
2671        u32 hash;
2672
2673        if (!skb_flow_dissect(skb, &keys))
2674                return;
2675
2676        if (keys.ports)
2677                skb->l4_rxhash = 1;
2678
2679        /* get a consistent hash (same value on both flow directions) */
2680        if (((__force u32)keys.dst < (__force u32)keys.src) ||
2681            (((__force u32)keys.dst == (__force u32)keys.src) &&
2682             ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
2683                swap(keys.dst, keys.src);
2684                swap(keys.port16[0], keys.port16[1]);
2685        }
2686
2687        hash = jhash_3words((__force u32)keys.dst,
2688                            (__force u32)keys.src,
2689                            (__force u32)keys.ports, hashrnd);
2690        if (!hash)
2691                hash = 1;
2692
2693        skb->rxhash = hash;
2694}
2695EXPORT_SYMBOL(__skb_get_rxhash);
2696
2697#ifdef CONFIG_RPS
2698
2699/* One global table that all flow-based protocols share. */
2700struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2701EXPORT_SYMBOL(rps_sock_flow_table);
2702
2703struct static_key rps_needed __read_mostly;
2704
2705static struct rps_dev_flow *
2706set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2707            struct rps_dev_flow *rflow, u16 next_cpu)
2708{
2709        if (next_cpu != RPS_NO_CPU) {
2710#ifdef CONFIG_RFS_ACCEL
2711                struct netdev_rx_queue *rxqueue;
2712                struct rps_dev_flow_table *flow_table;
2713                struct rps_dev_flow *old_rflow;
2714                u32 flow_id;
2715                u16 rxq_index;
2716                int rc;
2717
2718                /* Should we steer this flow to a different hardware queue? */
2719                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2720                    !(dev->features & NETIF_F_NTUPLE))
2721                        goto out;
2722                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2723                if (rxq_index == skb_get_rx_queue(skb))
2724                        goto out;
2725
2726                rxqueue = dev->_rx + rxq_index;
2727                flow_table = rcu_dereference(rxqueue->rps_flow_table);
2728                if (!flow_table)
2729                        goto out;
2730                flow_id = skb->rxhash & flow_table->mask;
2731                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2732                                                        rxq_index, flow_id);
2733                if (rc < 0)
2734                        goto out;
2735                old_rflow = rflow;
2736                rflow = &flow_table->flows[flow_id];
2737                rflow->filter = rc;
2738                if (old_rflow->filter == rflow->filter)
2739                        old_rflow->filter = RPS_NO_FILTER;
2740        out:
2741#endif
2742                rflow->last_qtail =
2743                        per_cpu(softnet_data, next_cpu).input_queue_head;
2744        }
2745
2746        rflow->cpu = next_cpu;
2747        return rflow;
2748}
2749
2750/*
2751 * get_rps_cpu is called from netif_receive_skb and returns the target
2752 * CPU from the RPS map of the receiving queue for a given skb.
2753 * rcu_read_lock must be held on entry.
2754 */
2755static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2756                       struct rps_dev_flow **rflowp)
2757{
2758        struct netdev_rx_queue *rxqueue;
2759        struct rps_map *map;
2760        struct rps_dev_flow_table *flow_table;
2761        struct rps_sock_flow_table *sock_flow_table;
2762        int cpu = -1;
2763        u16 tcpu;
2764
2765        if (skb_rx_queue_recorded(skb)) {
2766                u16 index = skb_get_rx_queue(skb);
2767                if (unlikely(index >= dev->real_num_rx_queues)) {
2768                        WARN_ONCE(dev->real_num_rx_queues > 1,
2769                                  "%s received packet on queue %u, but number "
2770                                  "of RX queues is %u\n",
2771                                  dev->name, index, dev->real_num_rx_queues);
2772                        goto done;
2773                }
2774                rxqueue = dev->_rx + index;
2775        } else
2776                rxqueue = dev->_rx;
2777
2778        map = rcu_dereference(rxqueue->rps_map);
2779        if (map) {
2780                if (map->len == 1 &&
2781                    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2782                        tcpu = map->cpus[0];
2783                        if (cpu_online(tcpu))
2784                                cpu = tcpu;
2785                        goto done;
2786                }
2787        } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2788                goto done;
2789        }
2790
2791        skb_reset_network_header(skb);
2792        if (!skb_get_rxhash(skb))
2793                goto done;
2794
2795        flow_table = rcu_dereference(rxqueue->rps_flow_table);
2796        sock_flow_table = rcu_dereference(rps_sock_flow_table);
2797        if (flow_table && sock_flow_table) {
2798                u16 next_cpu;
2799                struct rps_dev_flow *rflow;
2800
2801                rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2802                tcpu = rflow->cpu;
2803
2804                next_cpu = sock_flow_table->ents[skb->rxhash &
2805                    sock_flow_table->mask];
2806
2807                /*
2808                 * If the desired CPU (where last recvmsg was done) is
2809                 * different from current CPU (one in the rx-queue flow
2810                 * table entry), switch if one of the following holds:
2811                 *   - Current CPU is unset (equal to RPS_NO_CPU).
2812                 *   - Current CPU is offline.
2813                 *   - The current CPU's queue tail has advanced beyond the
2814                 *     last packet that was enqueued using this table entry.
2815                 *     This guarantees that all previous packets for the flow
2816                 *     have been dequeued, thus preserving in order delivery.
2817                 */
2818                if (unlikely(tcpu != next_cpu) &&
2819                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2820                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2821                      rflow->last_qtail)) >= 0)) {
2822                        tcpu = next_cpu;
2823                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2824                }
2825
2826                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2827                        *rflowp = rflow;
2828                        cpu = tcpu;
2829                        goto done;
2830                }
2831        }
2832
2833        if (map) {
2834                tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2835
2836                if (cpu_online(tcpu)) {
2837                        cpu = tcpu;
2838                        goto done;
2839                }
2840        }
2841
2842done:
2843        return cpu;
2844}
2845
2846#ifdef CONFIG_RFS_ACCEL
2847
2848/**
2849 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2850 * @dev: Device on which the filter was set
2851 * @rxq_index: RX queue index
2852 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2853 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2854 *
2855 * Drivers that implement ndo_rx_flow_steer() should periodically call
2856 * this function for each installed filter and remove the filters for
2857 * which it returns %true.
2858 */
2859bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2860                         u32 flow_id, u16 filter_id)
2861{
2862        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2863        struct rps_dev_flow_table *flow_table;
2864        struct rps_dev_flow *rflow;
2865        bool expire = true;
2866        int cpu;
2867
2868        rcu_read_lock();
2869        flow_table = rcu_dereference(rxqueue->rps_flow_table);
2870        if (flow_table && flow_id <= flow_table->mask) {
2871                rflow = &flow_table->flows[flow_id];
2872                cpu = ACCESS_ONCE(rflow->cpu);
2873                if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2874                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2875                           rflow->last_qtail) <
2876                     (int)(10 * flow_table->mask)))
2877                        expire = false;
2878        }
2879        rcu_read_unlock();
2880        return expire;
2881}
2882EXPORT_SYMBOL(rps_may_expire_flow);
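
/*
 * Editorial example (not part of the original source): a driver that
 * implements ndo_rx_flow_steer() would walk its installed filters from
 * periodic work and remove the ones the stack no longer cares about.  The
 * ring/filter bookkeeping and example_remove_filter() are hypothetical.
 *
 *	static void example_expire_filters(struct example_ring *ring)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < ring->n_filters; i++) {
 *			struct example_filter *f = &ring->filters[i];
 *
 *			if (f->in_use &&
 *			    rps_may_expire_flow(ring->netdev, ring->index,
 *						f->flow_id, i))
 *				example_remove_filter(ring, f);
 *		}
 *	}
 */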
2883
2884#endif /* CONFIG_RFS_ACCEL */
2885
2886/* Called from hardirq (IPI) context */
2887static void rps_trigger_softirq(void *data)
2888{
2889        struct softnet_data *sd = data;
2890
2891        ____napi_schedule(sd, &sd->backlog);
2892        sd->received_rps++;
2893}
2894
2895#endif /* CONFIG_RPS */
2896
2897/*
2898 * Check if this softnet_data structure is another cpu one
2899 * If yes, queue it to our IPI list and return 1
2900 * If no, return 0
2901 */
2902static int rps_ipi_queued(struct softnet_data *sd)
2903{
2904#ifdef CONFIG_RPS
2905        struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2906
2907        if (sd != mysd) {
2908                sd->rps_ipi_next = mysd->rps_ipi_list;
2909                mysd->rps_ipi_list = sd;
2910
2911                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2912                return 1;
2913        }
2914#endif /* CONFIG_RPS */
2915        return 0;
2916}
2917
2918/*
2919 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2920 * queue (may be a remote CPU queue).
2921 */
2922static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2923                              unsigned int *qtail)
2924{
2925        struct softnet_data *sd;
2926        unsigned long flags;
2927
2928        sd = &per_cpu(softnet_data, cpu);
2929
2930        local_irq_save(flags);
2931
2932        rps_lock(sd);
2933        if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2934                if (skb_queue_len(&sd->input_pkt_queue)) {
2935enqueue:
2936                        __skb_queue_tail(&sd->input_pkt_queue, skb);
2937                        input_queue_tail_incr_save(sd, qtail);
2938                        rps_unlock(sd);
2939                        local_irq_restore(flags);
2940                        return NET_RX_SUCCESS;
2941                }
2942
2943                /* Schedule NAPI for backlog device
2944                 * We can use a non-atomic operation since we own the queue lock.
2945                 */
2946                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2947                        if (!rps_ipi_queued(sd))
2948                                ____napi_schedule(sd, &sd->backlog);
2949                }
2950                goto enqueue;
2951        }
2952
2953        sd->dropped++;
2954        rps_unlock(sd);
2955
2956        local_irq_restore(flags);
2957
2958        atomic_long_inc(&skb->dev->rx_dropped);
2959        kfree_skb(skb);
2960        return NET_RX_DROP;
2961}
2962
2963/**
2964 *      netif_rx        -       post buffer to the network code
2965 *      @skb: buffer to post
2966 *
2967 *      This function receives a packet from a device driver and queues it for
2968 *      the upper (protocol) levels to process.  It always succeeds. The buffer
2969 *      may be dropped during processing for congestion control or by the
2970 *      protocol layers.
2971 *
2972 *      return values:
2973 *      NET_RX_SUCCESS  (no congestion)
2974 *      NET_RX_DROP     (packet was dropped)
2975 *
2976 */
2977
2978int netif_rx(struct sk_buff *skb)
2979{
2980        int ret;
2981
2982        /* if netpoll wants it, pretend we never saw it */
2983        if (netpoll_rx(skb))
2984                return NET_RX_DROP;
2985
2986        net_timestamp_check(netdev_tstamp_prequeue, skb);
2987
2988        trace_netif_rx(skb);
2989#ifdef CONFIG_RPS
2990        if (static_key_false(&rps_needed)) {
2991                struct rps_dev_flow voidflow, *rflow = &voidflow;
2992                int cpu;
2993
2994                preempt_disable();
2995                rcu_read_lock();
2996
2997                cpu = get_rps_cpu(skb->dev, skb, &rflow);
2998                if (cpu < 0)
2999                        cpu = smp_processor_id();
3000
3001                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3002
3003                rcu_read_unlock();
3004                preempt_enable();
3005        } else
3006#endif
3007        {
3008                unsigned int qtail;
3009                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3010                put_cpu();
3011        }
3012        return ret;
3013}
3014EXPORT_SYMBOL(netif_rx);
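
/*
 * Editorial example (not part of the original source): a non-NAPI driver's
 * receive path typically copies the frame out of its ring, lets
 * eth_type_trans() set the protocol and incoming device, and then hands the
 * skb to netif_rx().  example_rx_one() and the raw buffer are hypothetical.
 *
 *	static void example_rx_one(struct net_device *dev, void *buf, int len)
 *	{
 *		struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);
 *
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return;
 *		}
 *		memcpy(skb_put(skb, len), buf, len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		dev->stats.rx_packets++;
 *		dev->stats.rx_bytes += len;
 *		netif_rx(skb);
 *	}
 */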
3015
3016int netif_rx_ni(struct sk_buff *skb)
3017{
3018        int err;
3019
3020        preempt_disable();
3021        err = netif_rx(skb);
3022        if (local_softirq_pending())
3023                do_softirq();
3024        preempt_enable();
3025
3026        return err;
3027}
3028EXPORT_SYMBOL(netif_rx_ni);
3029
3030static void net_tx_action(struct softirq_action *h)
3031{
3032        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3033
3034        if (sd->completion_queue) {
3035                struct sk_buff *clist;
3036
3037                local_irq_disable();
3038                clist = sd->completion_queue;
3039                sd->completion_queue = NULL;
3040                local_irq_enable();
3041
3042                while (clist) {
3043                        struct sk_buff *skb = clist;
3044                        clist = clist->next;
3045
3046                        WARN_ON(atomic_read(&skb->users));
3047                        trace_kfree_skb(skb, net_tx_action);
3048                        __kfree_skb(skb);
3049                }
3050        }
3051
3052        if (sd->output_queue) {
3053                struct Qdisc *head;
3054
3055                local_irq_disable();
3056                head = sd->output_queue;
3057                sd->output_queue = NULL;
3058                sd->output_queue_tailp = &sd->output_queue;
3059                local_irq_enable();
3060
3061                while (head) {
3062                        struct Qdisc *q = head;
3063                        spinlock_t *root_lock;
3064
3065                        head = head->next_sched;
3066
3067                        root_lock = qdisc_lock(q);
3068                        if (spin_trylock(root_lock)) {
3069                                smp_mb__before_clear_bit();
3070                                clear_bit(__QDISC_STATE_SCHED,
3071                                          &q->state);
3072                                qdisc_run(q);
3073                                spin_unlock(root_lock);
3074                        } else {
3075                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
3076                                              &q->state)) {
3077                                        __netif_reschedule(q);
3078                                } else {
3079                                        smp_mb__before_clear_bit();
3080                                        clear_bit(__QDISC_STATE_SCHED,
3081                                                  &q->state);
3082                                }
3083                        }
3084                }
3085        }
3086}
3087
3088#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3089    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3090/* This hook is defined here for ATM LANE */
3091int (*br_fdb_test_addr_hook)(struct net_device *dev,
3092                             unsigned char *addr) __read_mostly;
3093EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3094#endif
3095
3096#ifdef CONFIG_NET_CLS_ACT
3097/* TODO: Maybe we should just force sch_ingress to be compiled in
3098 * whenever CONFIG_NET_CLS_ACT is. Otherwise we pay for a few useless
3099 * instructions (a compare and two extra stores) when CONFIG_NET_CLS_ACT
3100 * is enabled but the ingress scheduler is not.
3101 * NOTE: This doesn't remove any functionality; if you don't have
3102 * the ingress scheduler, you just can't add policies on ingress.
3103 *
3104 */
3105static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3106{
3107        struct net_device *dev = skb->dev;
3108        u32 ttl = G_TC_RTTL(skb->tc_verd);
3109        int result = TC_ACT_OK;
3110        struct Qdisc *q;
3111
3112        if (unlikely(MAX_RED_LOOP < ttl++)) {
3113                net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3114                                     skb->skb_iif, dev->ifindex);
3115                return TC_ACT_SHOT;
3116        }
3117
3118        skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3119        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3120
3121        q = rxq->qdisc;
3122        if (q != &noop_qdisc) {
3123                spin_lock(qdisc_lock(q));
3124                if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3125                        result = qdisc_enqueue_root(skb, q);
3126                spin_unlock(qdisc_lock(q));
3127        }
3128
3129        return result;
3130}
3131
3132static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3133                                         struct packet_type **pt_prev,
3134                                         int *ret, struct net_device *orig_dev)
3135{
3136        struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3137
3138        if (!rxq || rxq->qdisc == &noop_qdisc)
3139                goto out;
3140
3141        if (*pt_prev) {
3142                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3143                *pt_prev = NULL;
3144        }
3145
3146        switch (ing_filter(skb, rxq)) {
3147        case TC_ACT_SHOT:
3148        case TC_ACT_STOLEN:
3149                kfree_skb(skb);
3150                return NULL;
3151        }
3152
3153out:
3154        skb->tc_verd = 0;
3155        return skb;
3156}
3157#endif
3158
3159/**
3160 *      netdev_rx_handler_register - register receive handler
3161 *      @dev: device to register a handler for
3162 *      @rx_handler: receive handler to register
3163 *      @rx_handler_data: data pointer that is used by rx handler
3164 *
3165 *      Register a receive handler for a device. This handler will then be
3166 *      called from __netif_receive_skb. A negative errno code is returned
3167 *      on a failure.
3168 *
3169 *      The caller must hold the rtnl_mutex.
3170 *
3171 *      For a general description of rx_handler, see enum rx_handler_result.
3172 */
3173int netdev_rx_handler_register(struct net_device *dev,
3174                               rx_handler_func_t *rx_handler,
3175                               void *rx_handler_data)
3176{
3177        ASSERT_RTNL();
3178
3179        if (dev->rx_handler)
3180                return -EBUSY;
3181
3182        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3183        rcu_assign_pointer(dev->rx_handler, rx_handler);
3184
3185        return 0;
3186}
3187EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3188
3189/**
3190 *      netdev_rx_handler_unregister - unregister receive handler
3191 *      @dev: device to unregister a handler from
3192 *
3193 *      Unregister a receive handler from a device.
3194 *
3195 *      The caller must hold the rtnl_mutex.
3196 */
3197void netdev_rx_handler_unregister(struct net_device *dev)
3198{
3199
3200        ASSERT_RTNL();
3201        RCU_INIT_POINTER(dev->rx_handler, NULL);
3202        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3203}
3204EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
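/*
 * Illustrative sketch of the rx_handler API above, in the style of what
 * bridge/bonding/macvlan do: register a per-port handler under RTNL and
 * later unregister it.  The handler here just passes every frame on; a
 * real one may also consume the skb (RX_HANDLER_CONSUMED) or retarget it
 * to another device (RX_HANDLER_ANOTHER).  All foo_* names are
 * hypothetical.
 */
#if 0
static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
{
        return RX_HANDLER_PASS;
}

static int foo_attach_port(struct net_device *port_dev, void *port_priv)
{
        int err;

        rtnl_lock();
        err = netdev_rx_handler_register(port_dev, foo_handle_frame,
                                         port_priv);
        rtnl_unlock();
        return err;
}

static void foo_detach_port(struct net_device *port_dev)
{
        rtnl_lock();
        netdev_rx_handler_unregister(port_dev);
        rtnl_unlock();
}
#endif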
3205
3206/*
3207 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3208 * the special handling of PFMEMALLOC skbs.
3209 */
3210static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3211{
3212        switch (skb->protocol) {
3213        case __constant_htons(ETH_P_ARP):
3214        case __constant_htons(ETH_P_IP):
3215        case __constant_htons(ETH_P_IPV6):
3216        case __constant_htons(ETH_P_8021Q):
3217                return true;
3218        default:
3219                return false;
3220        }
3221}
3222
3223static int __netif_receive_skb(struct sk_buff *skb)
3224{
3225        struct packet_type *ptype, *pt_prev;
3226        rx_handler_func_t *rx_handler;
3227        struct net_device *orig_dev;
3228        struct net_device *null_or_dev;
3229        bool deliver_exact = false;
3230        int ret = NET_RX_DROP;
3231        __be16 type;
3232        unsigned long pflags = current->flags;
3233
3234        net_timestamp_check(!netdev_tstamp_prequeue, skb);
3235
3236        trace_netif_receive_skb(skb);
3237
3238        /*
3239         * PFMEMALLOC skbs are special, they should
3240         * - be delivered to SOCK_MEMALLOC sockets only
3241         * - stay away from userspace
3242         * - have bounded memory usage
3243         *
3244         * Use PF_MEMALLOC as this saves us from propagating the allocation
3245         * context down to all allocation sites.
3246         */
3247        if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3248                current->flags |= PF_MEMALLOC;
3249
3250        /* if we've gotten here through NAPI, check netpoll */
3251        if (netpoll_receive_skb(skb))
3252                goto out;
3253
3254        orig_dev = skb->dev;
3255
3256        skb_reset_network_header(skb);
3257        skb_reset_transport_header(skb);
3258        skb_reset_mac_len(skb);
3259
3260        pt_prev = NULL;
3261
3262        rcu_read_lock();
3263
3264another_round:
3265        skb->skb_iif = skb->dev->ifindex;
3266
3267        __this_cpu_inc(softnet_data.processed);
3268
3269        if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3270                skb = vlan_untag(skb);
3271                if (unlikely(!skb))
3272                        goto unlock;
3273        }
3274
3275#ifdef CONFIG_NET_CLS_ACT
3276        if (skb->tc_verd & TC_NCLS) {
3277                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3278                goto ncls;
3279        }
3280#endif
3281
3282        if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3283                goto skip_taps;
3284
3285        list_for_each_entry_rcu(ptype, &ptype_all, list) {
3286                if (!ptype->dev || ptype->dev == skb->dev) {
3287                        if (pt_prev)
3288                                ret = deliver_skb(skb, pt_prev, orig_dev);
3289                        pt_prev = ptype;
3290                }
3291        }
3292
3293skip_taps:
3294#ifdef CONFIG_NET_CLS_ACT
3295        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3296        if (!skb)
3297                goto unlock;
3298ncls:
3299#endif
3300
3301        if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3302                                && !skb_pfmemalloc_protocol(skb))
3303                goto drop;
3304
3305        if (vlan_tx_tag_present(skb)) {
3306                if (pt_prev) {
3307                        ret = deliver_skb(skb, pt_prev, orig_dev);
3308                        pt_prev = NULL;
3309                }
3310                if (vlan_do_receive(&skb))
3311                        goto another_round;
3312                else if (unlikely(!skb))
3313                        goto unlock;
3314        }
3315
3316        rx_handler = rcu_dereference(skb->dev->rx_handler);
3317        if (rx_handler) {
3318                if (pt_prev) {
3319                        ret = deliver_skb(skb, pt_prev, orig_dev);
3320                        pt_prev = NULL;
3321                }
3322                switch (rx_handler(&skb)) {
3323                case RX_HANDLER_CONSUMED:
3324                        goto unlock;
3325                case RX_HANDLER_ANOTHER:
3326                        goto another_round;
3327                case RX_HANDLER_EXACT:
3328                        deliver_exact = true;
3329                case RX_HANDLER_PASS:
3330                        break;
3331                default:
3332                        BUG();
3333                }
3334        }
3335
3336        if (vlan_tx_nonzero_tag_present(skb))
3337                skb->pkt_type = PACKET_OTHERHOST;
3338
3339        /* deliver only exact match when indicated */
3340        null_or_dev = deliver_exact ? skb->dev : NULL;
3341
3342        type = skb->protocol;
3343        list_for_each_entry_rcu(ptype,
3344                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3345                if (ptype->type == type &&
3346                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3347                     ptype->dev == orig_dev)) {
3348                        if (pt_prev)
3349                                ret = deliver_skb(skb, pt_prev, orig_dev);
3350                        pt_prev = ptype;
3351                }
3352        }
3353
3354        if (pt_prev) {
3355                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3356                        goto drop;
3357                else
3358                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3359        } else {
3360drop:
3361                atomic_long_inc(&skb->dev->rx_dropped);
3362                kfree_skb(skb);
3363                /* Jamal, now you will not be able to escape explaining
3364                 * to me how you were going to use this. :-)
3365                 */
3366                ret = NET_RX_DROP;
3367        }
3368
3369unlock:
3370        rcu_read_unlock();
3371out:
3372        tsk_restore_flags(current, pflags, PF_MEMALLOC);
3373        return ret;
3374}
3375
3376/**
3377 *      netif_receive_skb - process receive buffer from network
3378 *      @skb: buffer to process
3379 *
3380 *      netif_receive_skb() is the main receive data processing function.
3381 *      It always succeeds. The buffer may be dropped during processing
3382 *      for congestion control or by the protocol layers.
3383 *
3384 *      This function may only be called from softirq context and interrupts
3385 *      should be enabled.
3386 *
3387 *      Return values (usually ignored):
3388 *      NET_RX_SUCCESS: no congestion
3389 *      NET_RX_DROP: packet was dropped
3390 */
3391int netif_receive_skb(struct sk_buff *skb)
3392{
3393        net_timestamp_check(netdev_tstamp_prequeue, skb);
3394
3395        if (skb_defer_rx_timestamp(skb))
3396                return NET_RX_SUCCESS;
3397
3398#ifdef CONFIG_RPS
3399        if (static_key_false(&rps_needed)) {
3400                struct rps_dev_flow voidflow, *rflow = &voidflow;
3401                int cpu, ret;
3402
3403                rcu_read_lock();
3404
3405                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3406
3407                if (cpu >= 0) {
3408                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3409                        rcu_read_unlock();
3410                        return ret;
3411                }
3412                rcu_read_unlock();
3413        }
3414#endif
3415        return __netif_receive_skb(skb);
3416}
3417EXPORT_SYMBOL(netif_receive_skb);
3418
3419/* Network device is going away; flush any packets still pending.
3420 * Called with irqs disabled.
3421 */
3422static void flush_backlog(void *arg)
3423{
3424        struct net_device *dev = arg;
3425        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3426        struct sk_buff *skb, *tmp;
3427
3428        rps_lock(sd);
3429        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3430                if (skb->dev == dev) {
3431                        __skb_unlink(skb, &sd->input_pkt_queue);
3432                        kfree_skb(skb);
3433                        input_queue_head_incr(sd);
3434                }
3435        }
3436        rps_unlock(sd);
3437
3438        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3439                if (skb->dev == dev) {
3440                        __skb_unlink(skb, &sd->process_queue);
3441                        kfree_skb(skb);
3442                        input_queue_head_incr(sd);
3443                }
3444        }
3445}
3446
3447static int napi_gro_complete(struct sk_buff *skb)
3448{
3449        struct packet_type *ptype;
3450        __be16 type = skb->protocol;
3451        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3452        int err = -ENOENT;
3453
3454        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3455
3456        if (NAPI_GRO_CB(skb)->count == 1) {
3457                skb_shinfo(skb)->gso_size = 0;
3458                goto out;
3459        }
3460
3461        rcu_read_lock();
3462        list_for_each_entry_rcu(ptype, head, list) {
3463                if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3464                        continue;
3465
3466                err = ptype->gro_complete(skb);
3467                break;
3468        }
3469        rcu_read_unlock();
3470
3471        if (err) {
3472                WARN_ON(&ptype->list == head);
3473                kfree_skb(skb);
3474                return NET_RX_SUCCESS;
3475        }
3476
3477out:
3478        return netif_receive_skb(skb);
3479}
3480
3481/* napi->gro_list contains packets ordered by age;
3482 * the youngest packets are at its head.
3483 * Complete skbs in reverse order to reduce latencies.
3484 */
3485void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3486{
3487        struct sk_buff *skb, *prev = NULL;
3488
3489        /* scan list and build reverse chain */
3490        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3491                skb->prev = prev;
3492                prev = skb;
3493        }
3494
3495        for (skb = prev; skb; skb = prev) {
3496                skb->next = NULL;
3497
3498                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3499                        return;
3500
3501                prev = skb->prev;
3502                napi_gro_complete(skb);
3503                napi->gro_count--;
3504        }
3505
3506        napi->gro_list = NULL;
3507}
3508EXPORT_SYMBOL(napi_gro_flush);
3509
3510enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3511{
3512        struct sk_buff **pp = NULL;
3513        struct packet_type *ptype;
3514        __be16 type = skb->protocol;
3515        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3516        int same_flow;
3517        int mac_len;
3518        enum gro_result ret;
3519
3520        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3521                goto normal;
3522
3523        if (skb_is_gso(skb) || skb_has_frag_list(skb))
3524                goto normal;
3525
3526        rcu_read_lock();
3527        list_for_each_entry_rcu(ptype, head, list) {
3528                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3529                        continue;
3530
3531                skb_set_network_header(skb, skb_gro_offset(skb));
3532                mac_len = skb->network_header - skb->mac_header;
3533                skb->mac_len = mac_len;
3534                NAPI_GRO_CB(skb)->same_flow = 0;
3535                NAPI_GRO_CB(skb)->flush = 0;
3536                NAPI_GRO_CB(skb)->free = 0;
3537
3538                pp = ptype->gro_receive(&napi->gro_list, skb);
3539                break;
3540        }
3541        rcu_read_unlock();
3542
3543        if (&ptype->list == head)
3544                goto normal;
3545
3546        same_flow = NAPI_GRO_CB(skb)->same_flow;
3547        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3548
3549        if (pp) {
3550                struct sk_buff *nskb = *pp;
3551
3552                *pp = nskb->next;
3553                nskb->next = NULL;
3554                napi_gro_complete(nskb);
3555                napi->gro_count--;
3556        }
3557
3558        if (same_flow)
3559                goto ok;
3560
3561        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3562                goto normal;
3563
3564        napi->gro_count++;
3565        NAPI_GRO_CB(skb)->count = 1;
3566        NAPI_GRO_CB(skb)->age = jiffies;
3567        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3568        skb->next = napi->gro_list;
3569        napi->gro_list = skb;
3570        ret = GRO_HELD;
3571
3572pull:
3573        if (skb_headlen(skb) < skb_gro_offset(skb)) {
3574                int grow = skb_gro_offset(skb) - skb_headlen(skb);
3575
3576                BUG_ON(skb->end - skb->tail < grow);
3577
3578                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3579
3580                skb->tail += grow;
3581                skb->data_len -= grow;
3582
3583                skb_shinfo(skb)->frags[0].page_offset += grow;
3584                skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3585
3586                if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3587                        skb_frag_unref(skb, 0);
3588                        memmove(skb_shinfo(skb)->frags,
3589                                skb_shinfo(skb)->frags + 1,
3590                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3591                }
3592        }
3593
3594ok:
3595        return ret;
3596
3597normal:
3598        ret = GRO_NORMAL;
3599        goto pull;
3600}
3601EXPORT_SYMBOL(dev_gro_receive);
3602
3603static inline gro_result_t
3604__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3605{
3606        struct sk_buff *p;
3607        unsigned int maclen = skb->dev->hard_header_len;
3608
3609        for (p = napi->gro_list; p; p = p->next) {
3610                unsigned long diffs;
3611
3612                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3613                diffs |= p->vlan_tci ^ skb->vlan_tci;
3614                if (maclen == ETH_HLEN)
3615                        diffs |= compare_ether_header(skb_mac_header(p),
3616                                                      skb_gro_mac_header(skb));
3617                else if (!diffs)
3618                        diffs = memcmp(skb_mac_header(p),
3619                                       skb_gro_mac_header(skb),
3620                                       maclen);
3621                NAPI_GRO_CB(p)->same_flow = !diffs;
3622                NAPI_GRO_CB(p)->flush = 0;
3623        }
3624
3625        return dev_gro_receive(napi, skb);
3626}
3627
3628gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3629{
3630        switch (ret) {
3631        case GRO_NORMAL:
3632                if (netif_receive_skb(skb))
3633                        ret = GRO_DROP;
3634                break;
3635
3636        case GRO_DROP:
3637                kfree_skb(skb);
3638                break;
3639
3640        case GRO_MERGED_FREE:
3641                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3642                        kmem_cache_free(skbuff_head_cache, skb);
3643                else
3644                        __kfree_skb(skb);
3645                break;
3646
3647        case GRO_HELD:
3648        case GRO_MERGED:
3649                break;
3650        }
3651
3652        return ret;
3653}
3654EXPORT_SYMBOL(napi_skb_finish);
3655
3656static void skb_gro_reset_offset(struct sk_buff *skb)
3657{
3658        const struct skb_shared_info *pinfo = skb_shinfo(skb);
3659        const skb_frag_t *frag0 = &pinfo->frags[0];
3660
3661        NAPI_GRO_CB(skb)->data_offset = 0;
3662        NAPI_GRO_CB(skb)->frag0 = NULL;
3663        NAPI_GRO_CB(skb)->frag0_len = 0;
3664
3665        if (skb->mac_header == skb->tail &&
3666            pinfo->nr_frags &&
3667            !PageHighMem(skb_frag_page(frag0))) {
3668                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3669                NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3670        }
3671}
3672
3673gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3674{
3675        skb_gro_reset_offset(skb);
3676
3677        return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3678}
3679EXPORT_SYMBOL(napi_gro_receive);
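/*
 * Illustrative sketch of the usual shape of a NAPI ->poll() routine that
 * feeds completed RX buffers through napi_gro_receive().  As
 * dev_gro_receive() above shows, aggregation only happens when the device
 * advertises NETIF_F_GRO; otherwise the skb falls through to
 * netif_receive_skb().  struct foo_priv and the foo_* helpers are
 * hypothetical.
 */
#if 0
struct foo_priv {                       /* hypothetical driver private data */
        struct net_device *netdev;
        struct napi_struct napi;
};

static int foo_poll(struct napi_struct *napi, int budget)
{
        struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
        int work_done = 0;

        while (work_done < budget) {
                struct sk_buff *skb = foo_rx_next(priv);        /* hypothetical */

                if (!skb)
                        break;

                skb->protocol = eth_type_trans(skb, priv->netdev);
                napi_gro_receive(napi, skb);
                work_done++;
        }

        if (work_done < budget) {
                napi_complete(napi);
                foo_enable_rx_irq(priv);        /* hypothetical */
        }
        return work_done;
}
#endif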
3680
3681static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3682{
3683        __skb_pull(skb, skb_headlen(skb));
3684        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3685        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3686        skb->vlan_tci = 0;
3687        skb->dev = napi->dev;
3688        skb->skb_iif = 0;
3689
3690        napi->skb = skb;
3691}
3692
3693struct sk_buff *napi_get_frags(struct napi_struct *napi)
3694{
3695        struct sk_buff *skb = napi->skb;
3696
3697        if (!skb) {
3698                skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3699                if (skb)
3700                        napi->skb = skb;
3701        }
3702        return skb;
3703}
3704EXPORT_SYMBOL(napi_get_frags);
3705
3706gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3707                               gro_result_t ret)
3708{
3709        switch (ret) {
3710        case GRO_NORMAL:
3711        case GRO_HELD:
3712                skb->protocol = eth_type_trans(skb, skb->dev);
3713
3714                if (ret == GRO_HELD)
3715                        skb_gro_pull(skb, -ETH_HLEN);
3716                else if (netif_receive_skb(skb))
3717                        ret = GRO_DROP;
3718                break;
3719
3720        case GRO_DROP:
3721        case GRO_MERGED_FREE:
3722                napi_reuse_skb(napi, skb);
3723                break;
3724
3725        case GRO_MERGED:
3726                break;
3727        }
3728
3729        return ret;
3730}
3731EXPORT_SYMBOL(napi_frags_finish);
3732
3733static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3734{
3735        struct sk_buff *skb = napi->skb;
3736        struct ethhdr *eth;
3737        unsigned int hlen;
3738        unsigned int off;
3739
3740        napi->skb = NULL;
3741
3742        skb_reset_mac_header(skb);
3743        skb_gro_reset_offset(skb);
3744
3745        off = skb_gro_offset(skb);
3746        hlen = off + sizeof(*eth);
3747        eth = skb_gro_header_fast(skb, off);
3748        if (skb_gro_header_hard(skb, hlen)) {
3749                eth = skb_gro_header_slow(skb, hlen, off);
3750                if (unlikely(!eth)) {
3751                        napi_reuse_skb(napi, skb);
3752                        skb = NULL;
3753                        goto out;
3754                }
3755        }
3756
3757        skb_gro_pull(skb, sizeof(*eth));
3758
3759        /*
3760         * This works because the only protocols we care about don't require
3761         * special handling.  We'll fix it up properly at the end.
3762         */
3763        skb->protocol = eth->h_proto;
3764
3765out:
3766        return skb;
3767}
3768
3769gro_result_t napi_gro_frags(struct napi_struct *napi)
3770{
3771        struct sk_buff *skb = napi_frags_skb(napi);
3772
3773        if (!skb)
3774                return GRO_DROP;
3775
3776        return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3777}
3778EXPORT_SYMBOL(napi_gro_frags);
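/*
 * Illustrative sketch of the page/frag based GRO receive path: the driver
 * asks for napi->skb via napi_get_frags(), attaches its RX page as a frag,
 * and lets napi_gro_frags() do the rest (napi_frags_skb() above pulls the
 * Ethernet header and sets skb->protocol).  The page/offset/length would
 * come from a hypothetical foo_* RX descriptor, and the truesize
 * accounting is deliberately simplistic.
 */
#if 0
static void foo_rx_frag(struct napi_struct *napi, struct page *page,
                        unsigned int offset, unsigned int len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (!skb) {
                put_page(page);         /* no skb available, drop the buffer */
                return;
        }

        skb_fill_page_desc(skb, 0, page, offset, len);
        skb->len += len;
        skb->data_len += len;
        skb->truesize += PAGE_SIZE;

        napi_gro_frags(napi);
}
#endif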
3779
3780/*
3781 * net_rps_action sends any pending IPIs for RPS.
3782 * Note: called with local irq disabled, but exits with local irq enabled.
3783 */
3784static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3785{
3786#ifdef CONFIG_RPS
3787        struct softnet_data *remsd = sd->rps_ipi_list;
3788
3789        if (remsd) {
3790                sd->rps_ipi_list = NULL;
3791
3792                local_irq_enable();
3793
3794                /* Send pending IPIs to kick RPS processing on remote cpus. */
3795                while (remsd) {
3796                        struct softnet_data *next = remsd->rps_ipi_next;
3797
3798                        if (cpu_online(remsd->cpu))
3799                                __smp_call_function_single(remsd->cpu,
3800                                                           &remsd->csd, 0);
3801                        remsd = next;
3802                }
3803        } else
3804#endif
3805                local_irq_enable();
3806}
3807
3808static int process_backlog(struct napi_struct *napi, int quota)
3809{
3810        int work = 0;
3811        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3812
3813#ifdef CONFIG_RPS
3814        /* Check if we have pending IPIs; it's better to send them now
3815         * rather than waiting for net_rx_action() to end.
3816         */
3817        if (sd->rps_ipi_list) {
3818                local_irq_disable();
3819                net_rps_action_and_irq_enable(sd);
3820        }
3821#endif
3822        napi->weight = weight_p;
3823        local_irq_disable();
3824        while (work < quota) {
3825                struct sk_buff *skb;
3826                unsigned int qlen;
3827
3828                while ((skb = __skb_dequeue(&sd->process_queue))) {
3829                        local_irq_enable();
3830                        __netif_receive_skb(skb);
3831                        local_irq_disable();
3832                        input_queue_head_incr(sd);
3833                        if (++work >= quota) {
3834                                local_irq_enable();
3835                                return work;
3836                        }
3837                }
3838
3839                rps_lock(sd);
3840                qlen = skb_queue_len(&sd->input_pkt_queue);
3841                if (qlen)
3842                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
3843                                                   &sd->process_queue);
3844
3845                if (qlen < quota - work) {
3846                        /*
3847                         * Inline a custom version of __napi_complete().
3848                         * Only the current cpu owns and manipulates this napi,
3849                         * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3850                         * We can use a plain write instead of clear_bit(),
3851                         * and we don't need an smp_mb() memory barrier.
3852                         */
3853                        list_del(&napi->poll_list);
3854                        napi->state = 0;
3855
3856                        quota = work + qlen;
3857                }
3858                rps_unlock(sd);
3859        }
3860        local_irq_enable();
3861
3862        return work;
3863}
3864
3865/**
3866 * __napi_schedule - schedule for receive
3867 * @n: entry to schedule
3868 *
3869 * The entry's receive function will be scheduled to run
3870 */
3871void __napi_schedule(struct napi_struct *n)
3872{
3873        unsigned long flags;
3874
3875        local_irq_save(flags);
3876        ____napi_schedule(&__get_cpu_var(softnet_data), n);
3877        local_irq_restore(flags);
3878}
3879EXPORT_SYMBOL(__napi_schedule);
3880
3881void __napi_complete(struct napi_struct *n)
3882{
3883        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3884        BUG_ON(n->gro_list);
3885
3886        list_del(&n->poll_list);
3887        smp_mb__before_clear_bit();
3888        clear_bit(NAPI_STATE_SCHED, &n->state);
3889}
3890EXPORT_SYMBOL(__napi_complete);
3891
3892void napi_complete(struct napi_struct *n)
3893{
3894        unsigned long flags;
3895
3896        /*
3897         * don't let napi dequeue from the cpu poll list
3898         * just in case it's running on a different cpu
3899         */
3900        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3901                return;
3902
3903        napi_gro_flush(n, false);
3904        local_irq_save(flags);
3905        __napi_complete(n);
3906        local_irq_restore(flags);
3907}
3908EXPORT_SYMBOL(napi_complete);
3909
3910void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3911                    int (*poll)(struct napi_struct *, int), int weight)
3912{
3913        INIT_LIST_HEAD(&napi->poll_list);
3914        napi->gro_count = 0;
3915        napi->gro_list = NULL;
3916        napi->skb = NULL;
3917        napi->poll = poll;
3918        napi->weight = weight;
3919        list_add(&napi->dev_list, &dev->napi_list);
3920        napi->dev = dev;
3921#ifdef CONFIG_NETPOLL
3922        spin_lock_init(&napi->poll_lock);
3923        napi->poll_owner = -1;
3924#endif
3925        set_bit(NAPI_STATE_SCHED, &napi->state);
3926}
3927EXPORT_SYMBOL(netif_napi_add);
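/*
 * Illustrative sketch of wiring NAPI into a driver, reusing the
 * hypothetical struct foo_priv and foo_poll() from the earlier poll
 * sketch.  netif_napi_add() leaves NAPI_STATE_SCHED set (see above), so
 * the driver calls napi_enable() from its ndo_open before the first
 * interrupt, and schedules the context from its RX interrupt once the
 * device IRQ has been masked.
 */
#if 0
static int foo_setup_napi(struct foo_priv *priv)
{
        netif_napi_add(priv->netdev, &priv->napi, foo_poll, 64);
        return 0;
}

static int foo_open(struct net_device *dev)
{
        struct foo_priv *priv = netdev_priv(dev);

        napi_enable(&priv->napi);
        foo_enable_rx_irq(priv);                /* hypothetical */
        return 0;
}

static irqreturn_t foo_rx_interrupt(int irq, void *dev_id)
{
        struct foo_priv *priv = dev_id;

        if (napi_schedule_prep(&priv->napi)) {
                foo_disable_rx_irq(priv);       /* hypothetical */
                __napi_schedule(&priv->napi);
        }
        return IRQ_HANDLED;
}
#endif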
3928
3929void netif_napi_del(struct napi_struct *napi)
3930{
3931        struct sk_buff *skb, *next;
3932
3933        list_del_init(&napi->dev_list);
3934        napi_free_frags(napi);
3935
3936        for (skb = napi->gro_list; skb; skb = next) {
3937                next = skb->next;
3938                skb->next = NULL;
3939                kfree_skb(skb);
3940        }
3941
3942        napi->gro_list = NULL;
3943        napi->gro_count = 0;
3944}
3945EXPORT_SYMBOL(netif_napi_del);
3946
3947static void net_rx_action(struct softirq_action *h)
3948{
3949        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3950        unsigned long time_limit = jiffies + 2;
3951        int budget = netdev_budget;
3952        void *have;
3953
3954        local_irq_disable();
3955
3956        while (!list_empty(&sd->poll_list)) {
3957                struct napi_struct *n;
3958                int work, weight;
3959
3960                /* If the softirq window is exhausted then punt.
3961                 * Allow this to run for 2 jiffies since that allows
3962                 * an average latency of 1.5/HZ.
3963                 */
3964                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3965                        goto softnet_break;
3966
3967                local_irq_enable();
3968
3969                /* Even though interrupts have been re-enabled, this
3970                 * access is safe because interrupts can only add new
3971                 * entries to the tail of this list, and only ->poll()
3972                 * calls can remove this head entry from the list.
3973                 */
3974                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3975
3976                have = netpoll_poll_lock(n);
3977
3978                weight = n->weight;
3979
3980                /* This NAPI_STATE_SCHED test is for avoiding a race
3981                 * with netpoll's poll_napi().  Only the entity which
3982                 * obtains the lock and sees NAPI_STATE_SCHED set will
3983                 * actually make the ->poll() call.  Therefore we avoid
3984                 * accidentally calling ->poll() when NAPI is not scheduled.
3985                 */
3986                work = 0;
3987                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3988                        work = n->poll(n, weight);
3989                        trace_napi_poll(n);
3990                }
3991
3992                WARN_ON_ONCE(work > weight);
3993
3994                budget -= work;
3995
3996                local_irq_disable();
3997
3998                /* Drivers must not modify the NAPI state if they
3999                 * consume the entire weight.  In such cases this code
4000                 * still "owns" the NAPI instance and therefore can
4001                 * move the instance around on the list at-will.
4002                 */
4003                if (unlikely(work == weight)) {
4004                        if (unlikely(napi_disable_pending(n))) {
4005                                local_irq_enable();
4006                                napi_complete(n);
4007                                local_irq_disable();
4008                        } else {
4009                                if (n->gro_list) {
4010                                        /* flush too old packets
4011                                         * If HZ < 1000, flush all packets.
4012                                         */
4013                                        local_irq_enable();
4014                                        napi_gro_flush(n, HZ >= 1000);
4015                                        local_irq_disable();
4016                                }
4017                                list_move_tail(&n->poll_list, &sd->poll_list);
4018                        }
4019                }
4020
4021                netpoll_poll_unlock(have);
4022        }
4023out:
4024        net_rps_action_and_irq_enable(sd);
4025
4026#ifdef CONFIG_NET_DMA
4027        /*
4028         * There may not be any more sk_buffs coming right now, so push
4029         * any pending DMA copies to hardware
4030         */
4031        dma_issue_pending_all();
4032#endif
4033
4034        return;
4035
4036softnet_break:
4037        sd->time_squeeze++;
4038        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4039        goto out;
4040}
4041
4042static gifconf_func_t *gifconf_list[NPROTO];
4043
4044/**
4045 *      register_gifconf        -       register a SIOCGIF handler
4046 *      @family: Address family
4047 *      @gifconf: Function handler
4048 *
4049 *      Register protocol dependent address dumping routines. The handler
4050 *      that is passed must not be freed or reused until it has been replaced
4051 *      by another handler.
4052 */
4053int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4054{
4055        if (family >= NPROTO)
4056                return -EINVAL;
4057        gifconf_list[family] = gifconf;
4058        return 0;
4059}
4060EXPORT_SYMBOL(register_gifconf);
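/*
 * Illustrative sketch of the contract a gifconf handler has with
 * dev_ifconf() below: called with a NULL buffer it reports how many bytes
 * it would need for @dev, otherwise it writes struct ifreq entries into
 * the user buffer and returns how many bytes it consumed.  AF_FOO,
 * foo_count_addrs() and foo_fill_ifreq() are hypothetical, and the exact
 * per-family layout is up to the handler.
 */
#if 0
static int foo_gifconf(struct net_device *dev, char __user *buf, int len)
{
        int naddrs = foo_count_addrs(dev);              /* hypothetical */
        int done = 0;

        if (!buf)
                return naddrs * sizeof(struct ifreq);

        while (naddrs-- && len - done >= (int)sizeof(struct ifreq)) {
                if (foo_fill_ifreq(dev, buf + done))    /* copies to user */
                        return -EFAULT;
                done += sizeof(struct ifreq);
        }
        return done;
}

static int __init foo_gifconf_init(void)
{
        return register_gifconf(AF_FOO, foo_gifconf);
}
#endif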
4061
4062
4063/*
4064 *      Map an interface index to its name (SIOCGIFNAME)
4065 */
4066
4067/*
4068 *      We need this ioctl for efficient implementation of the
4069 *      if_indextoname() function required by the IPv6 API.  Without
4070 *      it, we would have to search all the interfaces to find a
4071 *      match.  --pb
4072 */
4073
4074static int dev_ifname(struct net *net, struct ifreq __user *arg)
4075{
4076        struct net_device *dev;
4077        struct ifreq ifr;
4078
4079        /*
4080         *      Fetch the caller's info block.
4081         */
4082
4083        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4084                return -EFAULT;
4085
4086        rcu_read_lock();
4087        dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4088        if (!dev) {
4089                rcu_read_unlock();
4090                return -ENODEV;
4091        }
4092
4093        strcpy(ifr.ifr_name, dev->name);
4094        rcu_read_unlock();
4095
4096        if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4097                return -EFAULT;
4098        return 0;
4099}
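/*
 * Illustrative sketch of the userspace side of the SIOCGIFNAME ioctl that
 * dev_ifname() above services; this is roughly what if_indextoname() does
 * under the hood.  Error handling is kept minimal on purpose.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

static int print_ifname(int ifindex)
{
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return -1;

        memset(&ifr, 0, sizeof(ifr));
        ifr.ifr_ifindex = ifindex;
        if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
                printf("%d -> %s\n", ifindex, ifr.ifr_name);

        close(fd);
        return 0;
}
#endif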
4100
4101/*
4102 *      Perform a SIOCGIFCONF call. This structure will change
4103 *      size eventually, and there is nothing I can do about it.
4104 *      Thus we will need a 'compatibility mode'.
4105 */
4106
4107static int dev_ifconf(struct net *net, char __user *arg)
4108{
4109        struct ifconf ifc;
4110        struct net_device *dev;
4111        char __user *pos;
4112        int len;
4113        int total;
4114        int i;
4115
4116        /*
4117         *      Fetch the caller's info block.
4118         */
4119
4120        if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4121                return -EFAULT;
4122
4123        pos = ifc.ifc_buf;
4124        len = ifc.ifc_len;
4125
4126        /*
4127         *      Loop over the interfaces, and write an info block for each.
4128         */
4129
4130        total = 0;
4131        for_each_netdev(net, dev) {
4132                for (i = 0; i < NPROTO; i++) {
4133                        if (gifconf_list[i]) {
4134                                int done;
4135                                if (!pos)
4136                                        done = gifconf_list[i](dev, NULL, 0);
4137                                else
4138                                        done = gifconf_list[i](dev, pos + total,
4139                                                               len - total);
4140                                if (done < 0)
4141                                        return -EFAULT;
4142                                total += done;
4143                        }
4144                }
4145        }
4146
4147        /*
4148         *      All done.  Write the updated control block back to the caller.
4149         */
4150        ifc.ifc_len = total;
4151
4152        /*
4153         *      Both BSD and Solaris return 0 here, so we do too.
4154         */
4155        return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4156}
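/*
 * Illustrative sketch of the classic userspace use of SIOCGIFCONF, which
 * dev_ifconf() above services: the caller supplies the buffer and ifc_len,
 * and the kernel rewrites ifc_len with the number of bytes it filled in.
 * Only families that registered a gifconf handler contribute entries,
 * which in practice usually means interfaces with an IPv4 address.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

static int list_interfaces(void)
{
        char buf[4096];
        struct ifconf ifc;
        struct ifreq *ifr;
        int fd, i, n;

        fd = socket(AF_INET, SOCK_DGRAM, 0);
        if (fd < 0)
                return -1;

        memset(&ifc, 0, sizeof(ifc));
        ifc.ifc_len = sizeof(buf);
        ifc.ifc_buf = buf;
        if (ioctl(fd, SIOCGIFCONF, &ifc) < 0) {
                close(fd);
                return -1;
        }

        ifr = (struct ifreq *)buf;
        n = ifc.ifc_len / sizeof(struct ifreq);
        for (i = 0; i < n; i++)
                printf("%s\n", ifr[i].ifr_name);

        close(fd);
        return 0;
}
#endif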
4157
4158#ifdef CONFIG_PROC_FS
4159
4160#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4161
4162#define get_bucket(x) ((x) >> BUCKET_SPACE)
4163#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4164#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4165
4166static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4167{
4168        struct net *net = seq_file_net(seq);
4169        struct net_device *dev;
4170        struct hlist_node *p;
4171        struct hlist_head *h;
4172        unsigned int count = 0, offset = get_offset(*pos);
4173
4174        h = &net->dev_name_head[get_bucket(*pos)];
4175        hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4176                if (++count == offset)
4177                        return dev;
4178        }
4179
4180        return NULL;
4181}
4182
4183static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4184{
4185        struct net_device *dev;
4186        unsigned int bucket;
4187
4188        do {
4189                dev = dev_from_same_bucket(seq, pos);
4190                if (dev)
4191                        return dev;
4192
4193                bucket = get_bucket(*pos) + 1;
4194                *pos = set_bucket_offset(bucket, 1);
4195        } while (bucket < NETDEV_HASHENTRIES);
4196
4197        return NULL;
4198}
4199
4200/*
4201 *      This is invoked by the /proc filesystem handler to display a device
4202 *      in detail.
4203 */
4204void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4205        __acquires(RCU)
4206{
4207        rcu_read_lock();
4208        if (!*pos)
4209                return SEQ_START_TOKEN;
4210
4211        if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4212                return NULL;
4213
4214        return dev_from_bucket(seq, pos);
4215}
4216
4217void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4218{
4219        ++*pos;
4220        return dev_from_bucket(seq, pos);
4221}
4222
4223void dev_seq_stop(struct seq_file *seq, void *v)
4224        __releases(RCU)
4225{
4226        rcu_read_unlock();
4227}
4228
4229static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4230{
4231        struct rtnl_link_stats64 temp;
4232        const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4233
4234        seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4235                   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4236                   dev->name, stats->rx_bytes, stats->rx_packets,
4237                   stats->rx_errors,
4238                   stats->rx_dropped + stats->rx_missed_errors,
4239                   stats->rx_fifo_errors,
4240                   stats->rx_length_errors + stats->rx_over_errors +
4241                    stats->rx_crc_errors + stats->rx_frame_errors,
4242                   stats->rx_compressed, stats->multicast,
4243                   stats->tx_bytes, stats->tx_packets,
4244                   stats->tx_errors, stats->tx_dropped,
4245                   stats->tx_fifo_errors, stats->collisions,
4246                   stats->tx_carrier_errors +
4247                    stats->tx_aborted_errors +
4248                    stats->tx_window_errors +
4249                    stats->tx_heartbeat_errors,
4250                   stats->tx_compressed);
4251}
4252
4253/*
4254 *      Called from the PROCfs module. This now uses the new arbitrary-sized
4255 *      /proc/net interface to create /proc/net/dev.
4256 */
4257static int dev_seq_show(struct seq_file *seq, void *v)
4258{
4259        if (v == SEQ_START_TOKEN)
4260                seq_puts(seq, "Inter-|   Receive                            "
4261                              "                    |  Transmit\n"
4262                              " face |bytes    packets errs drop fifo frame "
4263                              "compressed multicast|bytes    packets errs "
4264                              "drop fifo colls carrier compressed\n");
4265        else
4266                dev_seq_printf_stats(seq, v);
4267        return 0;
4268}
4269
4270static struct softnet_data *softnet_get_online(loff_t *pos)
4271{
4272        struct softnet_data *sd = NULL;
4273
4274        while (*pos < nr_cpu_ids)
4275                if (cpu_online(*pos)) {
4276                        sd = &per_cpu(softnet_data, *pos);
4277                        break;
4278                } else
4279                        ++*pos;
4280        return sd;
4281}
4282
4283static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4284{
4285        return softnet_get_online(pos);
4286}
4287
4288static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4289{
4290        ++*pos;
4291        return softnet_get_online(pos);
4292}
4293
4294static void softnet_seq_stop(struct seq_file *seq, void *v)
4295{
4296}
4297
4298static int softnet_seq_show(struct seq_file *seq, void *v)
4299{
4300        struct softnet_data *sd = v;
4301
4302        seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4303                   sd->processed, sd->dropped, sd->time_squeeze, 0,
4304                   0, 0, 0, 0, /* was fastroute */
4305                   sd->cpu_collision, sd->received_rps);
4306        return 0;
4307}
4308
4309static const struct seq_operations dev_seq_ops = {
4310        .start = dev_seq_start,
4311        .next  = dev_seq_next,
4312        .stop  = dev_seq_stop,
4313        .show  = dev_seq_show,
4314};
4315
4316static int dev_seq_open(struct inode *inode, struct file *file)
4317{
4318        return seq_open_net(inode, file, &dev_seq_ops,
4319                            sizeof(struct seq_net_private));
4320}
4321
4322static const struct file_operations dev_seq_fops = {
4323        .owner   = THIS_MODULE,
4324        .open    = dev_seq_open,
4325        .read    = seq_read,
4326        .llseek  = seq_lseek,
4327        .release = seq_release_net,
4328};
4329
4330static const struct seq_operations softnet_seq_ops = {
4331        .start = softnet_seq_start,
4332        .next  = softnet_seq_next,
4333        .stop  = softnet_seq_stop,
4334        .show  = softnet_seq_show,
4335};
4336
4337static int softnet_seq_open(struct inode *inode, struct file *file)
4338{
4339        return seq_open(file, &softnet_seq_ops);
4340}
4341
4342static const struct file_operations softnet_seq_fops = {
4343        .owner   = THIS_MODULE,
4344        .open    = softnet_seq_open,
4345        .read    = seq_read,
4346        .llseek  = seq_lseek,
4347        .release = seq_release,
4348};
4349
4350static void *ptype_get_idx(loff_t pos)
4351{
4352        struct packet_type *pt = NULL;
4353        loff_t i = 0;
4354        int t;
4355
4356        list_for_each_entry_rcu(pt, &ptype_all, list) {
4357                if (i == pos)
4358                        return pt;
4359                ++i;
4360        }
4361
4362        for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4363                list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4364                        if (i == pos)
4365                                return pt;
4366                        ++i;
4367                }
4368        }
4369        return NULL;
4370}
4371
4372static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4373        __acquires(RCU)
4374{
4375        rcu_read_lock();
4376        return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4377}
4378
4379static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4380{
4381        struct packet_type *pt;
4382        struct list_head *nxt;
4383        int hash;
4384
4385        ++*pos;
4386        if (v == SEQ_START_TOKEN)
4387                return ptype_get_idx(0);
4388
4389        pt = v;
4390        nxt = pt->list.next;
4391        if (pt->type == htons(ETH_P_ALL)) {
4392                if (nxt != &ptype_all)
4393                        goto found;
4394                hash = 0;
4395                nxt = ptype_base[0].next;
4396        } else
4397                hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4398
4399        while (nxt == &ptype_base[hash]) {
4400                if (++hash >= PTYPE_HASH_SIZE)
4401                        return NULL;
4402                nxt = ptype_base[hash].next;
4403        }
4404found:
4405        return list_entry(nxt, struct packet_type, list);
4406}
4407
4408static void ptype_seq_stop(struct seq_file *seq, void *v)
4409        __releases(RCU)
4410{
4411        rcu_read_unlock();
4412}
4413
4414static int ptype_seq_show(struct seq_file *seq, void *v)
4415{
4416        struct packet_type *pt = v;
4417
4418        if (v == SEQ_START_TOKEN)
4419                seq_puts(seq, "Type Device      Function\n");
4420        else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4421                if (pt->type == htons(ETH_P_ALL))
4422                        seq_puts(seq, "ALL ");
4423                else
4424                        seq_printf(seq, "%04x", ntohs(pt->type));
4425
4426                seq_printf(seq, " %-8s %pF\n",
4427                           pt->dev ? pt->dev->name : "", pt->func);
4428        }
4429
4430        return 0;
4431}
4432
4433static const struct seq_operations ptype_seq_ops = {
4434        .start = ptype_seq_start,
4435        .next  = ptype_seq_next,
4436        .stop  = ptype_seq_stop,
4437        .show  = ptype_seq_show,
4438};
4439
4440static int ptype_seq_open(struct inode *inode, struct file *file)
4441{
4442        return seq_open_net(inode, file, &ptype_seq_ops,
4443                        sizeof(struct seq_net_private));
4444}
4445
4446static const struct file_operations ptype_seq_fops = {
4447        .owner   = THIS_MODULE,
4448        .open    = ptype_seq_open,
4449        .read    = seq_read,
4450        .llseek  = seq_lseek,
4451        .release = seq_release_net,
4452};
4453
4454
4455static int __net_init dev_proc_net_init(struct net *net)
4456{
4457        int rc = -ENOMEM;
4458
4459        if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4460                goto out;
4461        if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4462                goto out_dev;
4463        if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4464                goto out_softnet;
4465
4466        if (wext_proc_init(net))
4467                goto out_ptype;
4468        rc = 0;
4469out:
4470        return rc;
4471out_ptype:
4472        proc_net_remove(net, "ptype");
4473out_softnet:
4474        proc_net_remove(net, "softnet_stat");
4475out_dev:
4476        proc_net_remove(net, "dev");
4477        goto out;
4478}
4479
4480static void __net_exit dev_proc_net_exit(struct net *net)
4481{
4482        wext_proc_exit(net);
4483
4484        proc_net_remove(net, "ptype");
4485        proc_net_remove(net, "softnet_stat");
4486        proc_net_remove(net, "dev");
4487}
4488
4489static struct pernet_operations __net_initdata dev_proc_ops = {
4490        .init = dev_proc_net_init,
4491        .exit = dev_proc_net_exit,
4492};
4493
4494static int __init dev_proc_init(void)
4495{
4496        return register_pernet_subsys(&dev_proc_ops);
4497}
4498#else
4499#define dev_proc_init() 0
4500#endif  /* CONFIG_PROC_FS */
4501
4502
4503/**
4504 *      netdev_set_master       -       set up master pointer
4505 *      @slave: slave device
4506 *      @master: new master device
4507 *
4508 *      Changes the master device of the slave. Pass %NULL to break the
4509 *      bonding. The caller must hold the RTNL semaphore. On a failure
4510 *      a negative errno code is returned. On success the reference counts
4511 *      are adjusted and the function returns zero.
4512 */
4513int netdev_set_master(struct net_device *slave, struct net_device *master)
4514{
4515        struct net_device *old = slave->master;
4516
4517        ASSERT_RTNL();
4518
4519        if (master) {
4520                if (old)
4521                        return -EBUSY;
4522                dev_hold(master);
4523        }
4524
4525        slave->master = master;
4526
4527        if (old)
4528                dev_put(old);
4529        return 0;
4530}
4531EXPORT_SYMBOL(netdev_set_master);
4532
4533/**
4534 *      netdev_set_bond_master  -       set up bonding master/slave pair
4535 *      @slave: slave device
4536 *      @master: new master device
4537 *
4538 *      Changes the master device of the slave. Pass %NULL to break the
4539 *      bonding. The caller must hold the RTNL semaphore. On a failure
4540 *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4541 *      to the routing socket and the function returns zero.
4542 */
4543int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4544{
4545        int err;
4546
4547        ASSERT_RTNL();
4548
4549        err = netdev_set_master(slave, master);
4550        if (err)
4551                return err;
4552        if (master)
4553                slave->flags |= IFF_SLAVE;
4554        else
4555                slave->flags &= ~IFF_SLAVE;
4556
4557        rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4558        return 0;
4559}
4560EXPORT_SYMBOL(netdev_set_bond_master);
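/*
 * Illustrative sketch of how a bonding-style driver might use
 * netdev_set_bond_master() when enslaving and releasing a port; both
 * paths already run under RTNL in such drivers.  The foo_* names are
 * hypothetical and the driver-specific slave setup is elided.
 */
#if 0
static int foo_enslave(struct net_device *bond_dev,
                       struct net_device *slave_dev)
{
        int err;

        ASSERT_RTNL();

        err = netdev_set_bond_master(slave_dev, bond_dev);
        if (err)
                return err;

        /* ... driver-specific setup of the new slave goes here ... */
        return 0;
}

static void foo_release_slave(struct net_device *slave_dev)
{
        ASSERT_RTNL();
        netdev_set_bond_master(slave_dev, NULL);
}
#endif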
4561
4562static void dev_change_rx_flags(struct net_device *dev, int flags)
4563{
4564        const struct net_device_ops *ops = dev->netdev_ops;
4565
4566        if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4567                ops->ndo_change_rx_flags(dev, flags);
4568}
4569
4570static int __dev_set_promiscuity(struct net_device *dev, int inc)
4571{
4572        unsigned int old_flags = dev->flags;
4573        kuid_t uid;
4574        kgid_t gid;
4575
4576        ASSERT_RTNL();
4577
4578        dev->flags |= IFF_PROMISC;
4579        dev->promiscuity += inc;
4580        if (dev->promiscuity == 0) {
4581                /*
4582                 * Avoid overflow.
4583                 * If inc causes overflow, untouch promisc and return error.
4584                 */
4585                if (inc < 0)
4586                        dev->flags &= ~IFF_PROMISC;
4587                else {
4588                        dev->promiscuity -= inc;
4589                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4590                                dev->name);
4591                        return -EOVERFLOW;
4592                }
4593        }
4594        if (dev->flags != old_flags) {
4595                pr_info("device %s %s promiscuous mode\n",
4596                        dev->name,
4597                        dev->flags & IFF_PROMISC ? "entered" : "left");
4598                if (audit_enabled) {
4599                        current_uid_gid(&uid, &gid);
4600                        audit_log(current->audit_context, GFP_ATOMIC,
4601                                AUDIT_ANOM_PROMISCUOUS,
4602                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4603                                dev->name, (dev->flags & IFF_PROMISC),
4604                                (old_flags & IFF_PROMISC),
4605                                from_kuid(&init_user_ns, audit_get_loginuid(current)),
4606                                from_kuid(&init_user_ns, uid),
4607                                from_kgid(&init_user_ns, gid),
4608                                audit_get_sessionid(current));
4609                }
4610
4611                dev_change_rx_flags(dev, IFF_PROMISC);
4612        }
4613        return 0;
4614}
4615
4616/**
4617 *      dev_set_promiscuity     - update promiscuity count on a device
4618 *      @dev: device
4619 *      @inc: modifier
4620 *
4621 *      Add or remove promiscuity from a device. While the count in the device
4622 *      remains above zero the interface remains promiscuous. Once it hits zero
4623 *      the device reverts back to normal filtering operation. A negative inc
4624 *      value is used to drop promiscuity on the device.
4625 *      Return 0 if successful or a negative errno code on error.
4626 */
4627int dev_set_promiscuity(struct net_device *dev, int inc)
4628{
4629        unsigned int old_flags = dev->flags;
4630        int err;
4631
4632        err = __dev_set_promiscuity(dev, inc);
4633        if (err < 0)
4634                return err;
4635        if (dev->flags != old_flags)
4636                dev_set_rx_mode(dev);
4637        return err;
4638}
4639EXPORT_SYMBOL(dev_set_promiscuity);
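/*
 * Illustrative sketch of the counted nature of dev_set_promiscuity():
 * every +1 must eventually be matched by a -1, and the interface only
 * leaves promiscuous mode once the count drops back to zero.  RTNL must
 * be held, as __dev_set_promiscuity() asserts; the foo_* wrappers are
 * hypothetical.
 */
#if 0
static int foo_start_capture(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);
        rtnl_unlock();
        return err;
}

static void foo_stop_capture(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);
        rtnl_unlock();
}
#endif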
4640
4641/**
4642 *      dev_set_allmulti        - update allmulti count on a device
4643 *      @dev: device
4644 *      @inc: modifier
4645 *
4646 *      Add or remove reception of all multicast frames on a device. While the
4647 *      count in the device remains above zero the interface keeps listening
4648 *      to all multicast frames. Once it hits zero the device reverts to normal
4649 *      filtering operation. A negative @inc value is used to drop the counter
4650 *      when releasing a resource needing all multicasts.
4651 *      Return 0 if successful or a negative errno code on error.
4652 */
4653
4654int dev_set_allmulti(struct net_device *dev, int inc)
4655{
4656        unsigned int old_flags = dev->flags;
4657
4658        ASSERT_RTNL();
4659
4660        dev->flags |= IFF_ALLMULTI;
4661        dev->allmulti += inc;
4662        if (dev->allmulti == 0) {
4663                /*
4664                 * Avoid overflow.
4665                 * If inc causes overflow, untouch allmulti and return error.
4666                 */
4667                if (inc < 0)
4668                        dev->flags &= ~IFF_ALLMULTI;
4669                else {
4670                        dev->allmulti -= inc;
4671                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4672                                dev->name);
4673                        return -EOVERFLOW;
4674                }
4675        }
4676        if (dev->flags ^ old_flags) {
4677                dev_change_rx_flags(dev, IFF_ALLMULTI);
4678                dev_set_rx_mode(dev);
4679        }
4680        return 0;
4681}
4682EXPORT_SYMBOL(dev_set_allmulti);
4683
4684/*
4685 *      Upload unicast and multicast address lists to device and
4686 *      configure RX filtering. When the device doesn't support unicast
4687 *      filtering it is put in promiscuous mode while unicast addresses
4688 *      are present.
4689 */
4690void __dev_set_rx_mode(struct net_device *dev)
4691{
4692        const struct net_device_ops *ops = dev->netdev_ops;
4693
4694        /* dev_open will call this function so the list will stay sane. */
4695        if (!(dev->flags&IFF_UP))
4696                return;
4697
4698        if (!netif_device_present(dev))
4699                return;
4700
4701        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4702                /* Unicast addresses changes may only happen under the rtnl,
4703                 * therefore calling __dev_set_promiscuity here is safe.
4704                 */
4705                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4706                        __dev_set_promiscuity(dev, 1);
4707                        dev->uc_promisc = true;
4708                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4709                        __dev_set_promiscuity(dev, -1);
4710                        dev->uc_promisc = false;
4711                }
4712        }
4713
4714        if (ops->ndo_set_rx_mode)
4715                ops->ndo_set_rx_mode(dev);
4716}
4717
4718void dev_set_rx_mode(struct net_device *dev)
4719{
4720        netif_addr_lock_bh(dev);
4721        __dev_set_rx_mode(dev);
4722        netif_addr_unlock_bh(dev);
4723}
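
/*
 * Usage sketch (hypothetical driver, not from this file): the driver side of
 * the contract above. dev_set_rx_mode() holds the address-list lock, so an
 * ndo_set_rx_mode() implementation may walk the multicast list directly.
 * foo_hw_set_promisc() and foo_hw_add_mc() are invented hardware helpers.
 */
static void foo_set_rx_mode(struct net_device *dev)
{
        struct netdev_hw_addr *ha;

        foo_hw_set_promisc(dev, dev->flags & (IFF_PROMISC | IFF_ALLMULTI));

        netdev_for_each_mc_addr(ha, dev)
                foo_hw_add_mc(dev, ha->addr);   /* program each filter entry */
}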
4724
4725/**
4726 *      dev_get_flags - get flags reported to userspace
4727 *      @dev: device
4728 *
4729 *      Get the combination of flag bits exported through APIs to userspace.
4730 */
4731unsigned int dev_get_flags(const struct net_device *dev)
4732{
4733        unsigned int flags;
4734
4735        flags = (dev->flags & ~(IFF_PROMISC |
4736                                IFF_ALLMULTI |
4737                                IFF_RUNNING |
4738                                IFF_LOWER_UP |
4739                                IFF_DORMANT)) |
4740                (dev->gflags & (IFF_PROMISC |
4741                                IFF_ALLMULTI));
4742
4743        if (netif_running(dev)) {
4744                if (netif_oper_up(dev))
4745                        flags |= IFF_RUNNING;
4746                if (netif_carrier_ok(dev))
4747                        flags |= IFF_LOWER_UP;
4748                if (netif_dormant(dev))
4749                        flags |= IFF_DORMANT;
4750        }
4751
4752        return flags;
4753}
4754EXPORT_SYMBOL(dev_get_flags);
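
/*
 * Usage sketch (hypothetical helper, not from this file): testing the
 * userspace-visible state computed by dev_get_flags(). foo_link_usable()
 * is an invented name.
 */
static bool foo_link_usable(const struct net_device *dev)
{
        unsigned int flags = dev_get_flags(dev);

        return (flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING);
}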
4755
4756int __dev_change_flags(struct net_device *dev, unsigned int flags)
4757{
4758        unsigned int old_flags = dev->flags;
4759        int ret;
4760
4761        ASSERT_RTNL();
4762
4763        /*
4764         *      Set the flags on our device.
4765         */
4766
4767        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4768                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4769                               IFF_AUTOMEDIA)) |
4770                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4771                                    IFF_ALLMULTI));
4772
4773        /*
4774         *      Load in the correct multicast list now the flags have changed.
4775         */
4776
4777        if ((old_flags ^ flags) & IFF_MULTICAST)
4778                dev_change_rx_flags(dev, IFF_MULTICAST);
4779
4780        dev_set_rx_mode(dev);
4781
4782        /*
4783         *      Have we downed the interface? We handle IFF_UP ourselves
4784         *      according to user attempts to set it, rather than blindly
4785         *      setting it.
4786         */
4787
4788        ret = 0;
4789        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4790                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4791
4792                if (!ret)
4793                        dev_set_rx_mode(dev);
4794        }
4795
4796        if ((flags ^ dev->gflags) & IFF_PROMISC) {
4797                int inc = (flags & IFF_PROMISC) ? 1 : -1;
4798
4799                dev->gflags ^= IFF_PROMISC;
4800                dev_set_promiscuity(dev, inc);
4801        }
4802
4803        /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4804           is important. Some (broken) drivers set IFF_PROMISC when
4805           IFF_ALLMULTI is requested, without asking us and without reporting it.
4806         */
4807        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4808                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4809
4810                dev->gflags ^= IFF_ALLMULTI;
4811                dev_set_allmulti(dev, inc);
4812        }
4813
4814        return ret;
4815}
4816
4817void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4818{
4819        unsigned int changes = dev->flags ^ old_flags;
4820
4821        if (changes & IFF_UP) {
4822                if (dev->flags & IFF_UP)
4823                        call_netdevice_notifiers(NETDEV_UP, dev);
4824                else
4825                        call_netdevice_notifiers(NETDEV_DOWN, dev);
4826        }
4827
4828        if (dev->flags & IFF_UP &&
4829            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4830                call_netdevice_notifiers(NETDEV_CHANGE, dev);
4831}
4832
4833/**
4834 *      dev_change_flags - change device settings
4835 *      @dev: device
4836 *      @flags: device state flags
4837 *
4838 *      Change settings on a device based on the supplied state flags. The
4839 *      flags are in the userspace-exported format.
4840 */
4841int dev_change_flags(struct net_device *dev, unsigned int flags)
4842{
4843        int ret;
4844        unsigned int changes, old_flags = dev->flags;
4845
4846        ret = __dev_change_flags(dev, flags);
4847        if (ret < 0)
4848                return ret;
4849
4850        changes = old_flags ^ dev->flags;
4851        if (changes)
4852                rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4853
4854        __dev_notify_flags(dev, old_flags);
4855        return ret;
4856}
4857EXPORT_SYMBOL(dev_change_flags);
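
/*
 * Usage sketch (hypothetical helper, not from this file): bringing an
 * interface up from kernel code with userspace-format flags, under RTNL.
 * This mirrors what the SIOCSIFFLAGS path below does on behalf of
 * ifconfig/ip. foo_bring_up() is an invented name.
 */
static int foo_bring_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
        rtnl_unlock();
        return err;
}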
4858
4859/**
4860 *      dev_set_mtu - Change maximum transfer unit
4861 *      @dev: device
4862 *      @new_mtu: new transfer unit
4863 *
4864 *      Change the maximum transfer size of the network device.
4865 */
4866int dev_set_mtu(struct net_device *dev, int new_mtu)
4867{
4868        const struct net_device_ops *ops = dev->netdev_ops;
4869        int err;
4870
4871        if (new_mtu == dev->mtu)
4872                return 0;
4873
4874        /*      MTU must be positive.    */
4875        if (new_mtu < 0)
4876                return -EINVAL;
4877
4878        if (!netif_device_present(dev))
4879                return -ENODEV;
4880
4881        err = 0;
4882        if (ops->ndo_change_mtu)
4883                err = ops->ndo_change_mtu(dev, new_mtu);
4884        else
4885                dev->mtu = new_mtu;
4886
4887        if (!err && dev->flags & IFF_UP)
4888                call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4889        return err;
4890}
4891EXPORT_SYMBOL(dev_set_mtu);
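
/*
 * Usage sketch (hypothetical driver, not from this file): a driver-side
 * ndo_change_mtu() of the kind dev_set_mtu() calls above. The 68..9000
 * range is an arbitrary example limit, not something the core imposes.
 */
static int foo_change_mtu(struct net_device *dev, int new_mtu)
{
        if (new_mtu < 68 || new_mtu > 9000)
                return -EINVAL;

        dev->mtu = new_mtu;     /* the core only does this itself when no
                                 * ndo_change_mtu() is provided */
        return 0;
}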
4892
4893/**
4894 *      dev_set_group - Change group this device belongs to
4895 *      @dev: device
4896 *      @new_group: group this device should belong to
4897 */
4898void dev_set_group(struct net_device *dev, int new_group)
4899{
4900        dev->group = new_group;
4901}
4902EXPORT_SYMBOL(dev_set_group);
4903
4904/**
4905 *      dev_set_mac_address - Change Media Access Control Address
4906 *      @dev: device
4907 *      @sa: new address
4908 *
4909 *      Change the hardware (MAC) address of the device
4910 */
4911int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4912{
4913        const struct net_device_ops *ops = dev->netdev_ops;
4914        int err;
4915
4916        if (!ops->ndo_set_mac_address)
4917                return -EOPNOTSUPP;
4918        if (sa->sa_family != dev->type)
4919                return -EINVAL;
4920        if (!netif_device_present(dev))
4921                return -ENODEV;
4922        err = ops->ndo_set_mac_address(dev, sa);
4923        if (!err)
4924                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4925        add_device_randomness(dev->dev_addr, dev->addr_len);
4926        return err;
4927}
4928EXPORT_SYMBOL(dev_set_mac_address);
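
/*
 * Usage sketch (hypothetical helper, not from this file): setting a MAC
 * address from kernel code for an Ethernet-sized address. sa_family must
 * match dev->type and RTNL must be held, just as on the SIOCSIFHWADDR
 * path below. foo_set_mac() is an invented name.
 */
static int foo_set_mac(struct net_device *dev, const u8 *addr)
{
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, addr, dev->addr_len);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();
        return err;
}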
4929
4930/*
4931 *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4932 */
4933static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4934{
4935        int err;
4936        struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4937
4938        if (!dev)
4939                return -ENODEV;
4940
4941        switch (cmd) {
4942        case SIOCGIFFLAGS:      /* Get interface flags */
4943                ifr->ifr_flags = (short) dev_get_flags(dev);
4944                return 0;
4945
4946        case SIOCGIFMETRIC:     /* Get the metric on the interface
4947                                   (currently unused) */
4948                ifr->ifr_metric = 0;
4949                return 0;
4950
4951        case SIOCGIFMTU:        /* Get the MTU of a device */
4952                ifr->ifr_mtu = dev->mtu;
4953                return 0;
4954
4955        case SIOCGIFHWADDR:
4956                if (!dev->addr_len)
4957                        memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4958                else
4959                        memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4960                               min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4961                ifr->ifr_hwaddr.sa_family = dev->type;
4962                return 0;
4963
4964        case SIOCGIFSLAVE:
4965                err = -EINVAL;
4966                break;
4967
4968        case SIOCGIFMAP:
4969                ifr->ifr_map.mem_start = dev->mem_start;
4970                ifr->ifr_map.mem_end   = dev->mem_end;
4971                ifr->ifr_map.base_addr = dev->base_addr;
4972                ifr->ifr_map.irq       = dev->irq;
4973                ifr->ifr_map.dma       = dev->dma;
4974                ifr->ifr_map.port      = dev->if_port;
4975                return 0;
4976
4977        case SIOCGIFINDEX:
4978                ifr->ifr_ifindex = dev->ifindex;
4979                return 0;
4980
4981        case SIOCGIFTXQLEN:
4982                ifr->ifr_qlen = dev->tx_queue_len;
4983                return 0;
4984
4985        default:
4986                /* dev_ioctl() should ensure this case
4987                 * is never reached
4988                 */
4989                WARN_ON(1);
4990                err = -ENOTTY;
4991                break;
4992
4993        }
4994        return err;
4995}
4996
4997/*
4998 *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4999 */
5000static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5001{
5002        int err;
5003        struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5004        const struct net_device_ops *ops;
5005
5006        if (!dev)
5007                return -ENODEV;
5008
5009        ops = dev->netdev_ops;
5010
5011        switch (cmd) {
5012        case SIOCSIFFLAGS:      /* Set interface flags */
5013                return dev_change_flags(dev, ifr->ifr_flags);
5014
5015        case SIOCSIFMETRIC:     /* Set the metric on the interface
5016                                   (currently unused) */
5017                return -EOPNOTSUPP;
5018
5019        case SIOCSIFMTU:        /* Set the MTU of a device */
5020                return dev_set_mtu(dev, ifr->ifr_mtu);
5021
5022        case SIOCSIFHWADDR:
5023                return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5024
5025        case SIOCSIFHWBROADCAST:
5026                if (ifr->ifr_hwaddr.sa_family != dev->type)
5027                        return -EINVAL;
5028                memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5029                       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5030                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5031                return 0;
5032
5033        case SIOCSIFMAP:
5034                if (ops->ndo_set_config) {
5035                        if (!netif_device_present(dev))
5036                                return -ENODEV;
5037                        return ops->ndo_set_config(dev, &ifr->ifr_map);
5038                }
5039                return -EOPNOTSUPP;
5040
5041        case SIOCADDMULTI:
5042                if (!ops->ndo_set_rx_mode ||
5043                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5044                        return -EINVAL;
5045                if (!netif_device_present(dev))
5046                        return -ENODEV;
5047                return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5048
5049        case SIOCDELMULTI:
5050                if (!ops->ndo_set_rx_mode ||
5051                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5052                        return -EINVAL;
5053                if (!netif_device_present(dev))
5054                        return -ENODEV;
5055                return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5056
5057        case SIOCSIFTXQLEN:
5058                if (ifr->ifr_qlen < 0)
5059                        return -EINVAL;
5060                dev->tx_queue_len = ifr->ifr_qlen;
5061                return 0;
5062
5063        case SIOCSIFNAME:
5064                ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5065                return dev_change_name(dev, ifr->ifr_newname);
5066
5067        case SIOCSHWTSTAMP:
5068                err = net_hwtstamp_validate(ifr);
5069                if (err)
5070                        return err;
5071                /* fall through */
5072
5073        /*
5074         *      Unknown or private ioctl
5075         */
5076        default:
5077                if ((cmd >= SIOCDEVPRIVATE &&
5078                    cmd <= SIOCDEVPRIVATE + 15) ||
5079                    cmd == SIOCBONDENSLAVE ||
5080                    cmd == SIOCBONDRELEASE ||
5081                    cmd == SIOCBONDSETHWADDR ||
5082                    cmd == SIOCBONDSLAVEINFOQUERY ||
5083                    cmd == SIOCBONDINFOQUERY ||
5084                    cmd == SIOCBONDCHANGEACTIVE ||
5085                    cmd == SIOCGMIIPHY ||
5086                    cmd == SIOCGMIIREG ||
5087                    cmd == SIOCSMIIREG ||
5088                    cmd == SIOCBRADDIF ||
5089                    cmd == SIOCBRDELIF ||
5090                    cmd == SIOCSHWTSTAMP ||
5091                    cmd == SIOCWANDEV) {
5092                        err = -EOPNOTSUPP;
5093                        if (ops->ndo_do_ioctl) {
5094                                if (netif_device_present(dev))
5095                                        err = ops->ndo_do_ioctl(dev, ifr, cmd);
5096                                else
5097                                        err = -ENODEV;
5098                        }
5099                } else
5100                        err = -EINVAL;
5101
5102        }
5103        return err;
5104}
5105
5106/*
5107 *      This function handles all "interface"-type I/O control requests. The actual
5108 *      'doing' part of this is dev_ifsioc above.
5109 */
5110
5111/**
5112 *      dev_ioctl       -       network device ioctl
5113 *      @net: the applicable net namespace
5114 *      @cmd: command to issue
5115 *      @arg: pointer to a struct ifreq in user space
5116 *
5117 *      Issue ioctl functions to devices. This is normally called by the
5118 *      user space syscall interfaces but can sometimes be useful for
5119 *      other purposes. The return value is the value returned to the syscall
5120 *      if positive, or a negative errno code on error.
5121 */
5122
5123int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5124{
5125        struct ifreq ifr;
5126        int ret;
5127        char *colon;
5128
5129        /* One special case: SIOCGIFCONF takes an ifconf argument
5130           and requires a shared lock, because it sleeps while writing
5131           to user space.
5132         */
5133
5134        if (cmd == SIOCGIFCONF) {
5135                rtnl_lock();
5136                ret = dev_ifconf(net, (char __user *) arg);
5137                rtnl_unlock();
5138                return ret;
5139        }
5140        if (cmd == SIOCGIFNAME)
5141                return dev_ifname(net, (struct ifreq __user *)arg);
5142
5143        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5144                return -EFAULT;
5145
5146        ifr.ifr_name[IFNAMSIZ-1] = 0;
5147
5148        colon = strchr(ifr.ifr_name, ':');
5149        if (colon)
5150                *colon = 0;
5151
5152        /*
5153         *      See which interface the caller is talking about.
5154         */
5155
5156        switch (cmd) {
5157        /*
5158         *      These ioctl calls:
5159         *      - can be done by all.
5160         *      - atomic and do not require locking.
5161         *      - return a value
5162         */
5163        case SIOCGIFFLAGS:
5164        case SIOCGIFMETRIC:
5165        case SIOCGIFMTU:
5166        case SIOCGIFHWADDR:
5167        case SIOCGIFSLAVE:
5168        case SIOCGIFMAP:
5169        case SIOCGIFINDEX:
5170        case SIOCGIFTXQLEN:
5171                dev_load(net, ifr.ifr_name);
5172                rcu_read_lock();
5173                ret = dev_ifsioc_locked(net, &ifr, cmd);
5174                rcu_read_unlock();
5175                if (!ret) {
5176                        if (colon)
5177                                *colon = ':';
5178                        if (copy_to_user(arg, &ifr,
5179                                         sizeof(struct ifreq)))
5180                                ret = -EFAULT;
5181                }
5182                return ret;
5183
5184        case SIOCETHTOOL:
5185                dev_load(net, ifr.ifr_name);
5186                rtnl_lock();
5187                ret = dev_ethtool(net, &ifr);
5188                rtnl_unlock();
5189                if (!ret) {
5190                        if (colon)
5191                                *colon = ':';
5192                        if (copy_to_user(arg, &ifr,
5193                                         sizeof(struct ifreq)))
5194                                ret = -EFAULT;
5195                }
5196                return ret;
5197
5198        /*
5199         *      These ioctl calls:
5200         *      - require superuser power.
5201         *      - require strict serialization.
5202         *      - return a value
5203         */
5204        case SIOCGMIIPHY:
5205        case SIOCGMIIREG:
5206        case SIOCSIFNAME:
5207                if (!capable(CAP_NET_ADMIN))
5208                        return -EPERM;
5209                dev_load(net, ifr.ifr_name);
5210                rtnl_lock();
5211                ret = dev_ifsioc(net, &ifr, cmd);
5212                rtnl_unlock();
5213                if (!ret) {
5214                        if (colon)
5215                                *colon = ':';
5216                        if (copy_to_user(arg, &ifr,
5217                                         sizeof(struct ifreq)))
5218                                ret = -EFAULT;
5219                }
5220                return ret;
5221
5222        /*
5223         *      These ioctl calls:
5224         *      - require superuser power.
5225         *      - require strict serialization.
5226         *      - do not return a value
5227         */
5228        case SIOCSIFFLAGS:
5229        case SIOCSIFMETRIC:
5230        case SIOCSIFMTU:
5231        case SIOCSIFMAP:
5232        case SIOCSIFHWADDR:
5233        case SIOCSIFSLAVE:
5234        case SIOCADDMULTI:
5235        case SIOCDELMULTI:
5236        case SIOCSIFHWBROADCAST:
5237        case SIOCSIFTXQLEN:
5238        case SIOCSMIIREG:
5239        case SIOCBONDENSLAVE:
5240        case SIOCBONDRELEASE:
5241        case SIOCBONDSETHWADDR:
5242        case SIOCBONDCHANGEACTIVE:
5243        case SIOCBRADDIF:
5244        case SIOCBRDELIF:
5245        case SIOCSHWTSTAMP:
5246                if (!capable(CAP_NET_ADMIN))
5247                        return -EPERM;
5248                /* fall through */
5249        case SIOCBONDSLAVEINFOQUERY:
5250        case SIOCBONDINFOQUERY:
5251                dev_load(net, ifr.ifr_name);
5252                rtnl_lock();
5253                ret = dev_ifsioc(net, &ifr, cmd);
5254                rtnl_unlock();
5255                return ret;
5256
5257        case SIOCGIFMEM:
5258                /* Get the per device memory space. We can add this but
5259                 * currently do not support it */
5260        case SIOCSIFMEM:
5261                /* Set the per device memory buffer space.
5262                 * Not applicable in our case */
5263        case SIOCSIFLINK:
5264                return -ENOTTY;
5265
5266        /*
5267         *      Unknown or private ioctl.
5268         */
5269        default:
5270                if (cmd == SIOCWANDEV ||
5271                    (cmd >= SIOCDEVPRIVATE &&
5272                     cmd <= SIOCDEVPRIVATE + 15)) {
5273                        dev_load(net, ifr.ifr_name);
5274                        rtnl_lock();
5275                        ret = dev_ifsioc(net, &ifr, cmd);
5276                        rtnl_unlock();
5277                        if (!ret && copy_to_user(arg, &ifr,
5278                                                 sizeof(struct ifreq)))
5279                                ret = -EFAULT;
5280                        return ret;
5281                }
5282                /* Take care of Wireless Extensions */
5283                if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5284                        return wext_handle_ioctl(net, &ifr, cmd, arg);
5285                return -ENOTTY;
5286        }
5287}
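
/*
 * Usage sketch (userspace program, not kernel code and not from this file):
 * the caller side of the SIOCGIFMTU branch handled above. "eth0" is just an
 * example interface name.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
        if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
                printf("%s mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);
        close(fd);
        return 0;
}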
5288
5289
5290/**
5291 *      dev_new_index   -       allocate an ifindex
5292 *      @net: the applicable net namespace
5293 *
5294 *      Returns a suitable unique value for a new device interface
5295 *      number.  The caller must hold the rtnl semaphore or the
5296 *      dev_base_lock to be sure it remains unique.
5297 */
5298static int dev_new_index(struct net *net)
5299{
5300        int ifindex = net->ifindex;
5301        for (;;) {
5302                if (++ifindex <= 0)
5303                        ifindex = 1;
5304                if (!__dev_get_by_index(net, ifindex))
5305                        return net->ifindex = ifindex;
5306        }
5307}
5308
5309/* Delayed registration/unregistration */
5310static LIST_HEAD(net_todo_list);
5311
5312static void net_set_todo(struct net_device *dev)
5313{
5314        list_add_tail(&dev->todo_list, &net_todo_list);
5315}
5316
5317static void rollback_registered_many(struct list_head *head)
5318{
5319        struct net_device *dev, *tmp;
5320
5321        BUG_ON(dev_boot_phase);
5322        ASSERT_RTNL();
5323
5324        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5325                /* Some devices call unregister without ever having been
5326                 * registered, to unwind a failed initialization. Remove
5327                 * those devices and proceed with the remaining ones.
5328                 */
5329                if (dev->reg_state == NETREG_UNINITIALIZED) {
5330                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5331                                 dev->name, dev);
5332
5333                        WARN_ON(1);
5334                        list_del(&dev->unreg_list);
5335                        continue;
5336                }
5337                dev->dismantle = true;
5338                BUG_ON(dev->reg_state != NETREG_REGISTERED);
5339        }
5340
5341        /* If device is running, close it first. */
5342        dev_close_many(head);
5343
5344        list_for_each_entry(dev, head, unreg_list) {
5345                /* And unlink it from device chain. */
5346                unlist_netdevice(dev);
5347
5348                dev->reg_state = NETREG_UNREGISTERING;
5349        }
5350
5351        synchronize_net();
5352
5353        list_for_each_entry(dev, head, unreg_list) {
5354                /* Shutdown queueing discipline. */
5355                dev_shutdown(dev);
5356
5357
5358                /* Notify protocols that we are about to destroy
5359                   this device. They should clean up all of their state.
5360                */
5361                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5362
5363                if (!dev->rtnl_link_ops ||
5364                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5365                        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5366
5367                /*
5368                 *      Flush the unicast and multicast chains
5369                 */
5370                dev_uc_flush(dev);
5371                dev_mc_flush(dev);
5372
5373                if (dev->netdev_ops->ndo_uninit)
5374                        dev->netdev_ops->ndo_uninit(dev);
5375
5376                /* Notifier chain MUST detach us from master device. */
5377                WARN_ON(dev->master);
5378
5379                /* Remove entries from kobject tree */
5380                netdev_unregister_kobject(dev);
5381        }
5382
5383        synchronize_net();
5384
5385        list_for_each_entry(dev, head, unreg_list)
5386                dev_put(dev);
5387}
5388
5389static void rollback_registered(struct net_device *dev)
5390{
5391        LIST_HEAD(single);
5392
5393        list_add(&dev->unreg_list, &single);
5394        rollback_registered_many(&single);
5395        list_del(&single);
5396}
5397
5398static netdev_features_t netdev_fix_features(struct net_device *dev,
5399        netdev_features_t features)
5400{
5401        /* Fix illegal checksum combinations */
5402        if ((features & NETIF_F_HW_CSUM) &&
5403            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5404                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5405                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5406        }
5407
5408        /* Fix illegal SG+CSUM combinations. */
5409        if ((features & NETIF_F_SG) &&
5410            !(features & NETIF_F_ALL_CSUM)) {
5411                netdev_dbg(dev,
5412                        "Dropping NETIF_F_SG since no checksum feature.\n");
5413                features &= ~NETIF_F_SG;
5414        }
5415
5416        /* TSO requires that SG is present as well. */
5417        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5418                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5419                features &= ~NETIF_F_ALL_TSO;
5420        }
5421
5422        /* TSO ECN requires that TSO is present as well. */
5423        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5424                features &= ~NETIF_F_TSO_ECN;
5425
5426        /* Software GSO depends on SG. */
5427        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5428                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5429                features &= ~NETIF_F_GSO;
5430        }
5431
5432        /* UFO needs SG and checksumming */
5433        if (features & NETIF_F_UFO) {
5434                /* maybe split UFO into V4 and V6? */
5435                if (!((features & NETIF_F_GEN_CSUM) ||
5436                    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5437                            == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5438                        netdev_dbg(dev,
5439                                "Dropping NETIF_F_UFO since no checksum offload features.\n");
5440                        features &= ~NETIF_F_UFO;
5441                }
5442
5443                if (!(features & NETIF_F_SG)) {
5444                        netdev_dbg(dev,
5445                                "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5446                        features &= ~NETIF_F_UFO;
5447                }
5448        }
5449
5450        return features;
5451}
5452
5453int __netdev_update_features(struct net_device *dev)
5454{
5455        netdev_features_t features;
5456        int err = 0;
5457
5458        ASSERT_RTNL();
5459
5460        features = netdev_get_wanted_features(dev);
5461
5462        if (dev->netdev_ops->ndo_fix_features)
5463                features = dev->netdev_ops->ndo_fix_features(dev, features);
5464
5465        /* driver might be less strict about feature dependencies */
5466        features = netdev_fix_features(dev, features);
5467
5468        if (dev->features == features)
5469                return 0;
5470
5471        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5472                &dev->features, &features);
5473
5474        if (dev->netdev_ops->ndo_set_features)
5475                err = dev->netdev_ops->ndo_set_features(dev, features);
5476
5477        if (unlikely(err < 0)) {
5478                netdev_err(dev,
5479                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
5480                        err, &features, &dev->features);
5481                return -1;
5482        }
5483
5484        if (!err)
5485                dev->features = features;
5486
5487        return 1;
5488}
5489
5490/**
5491 *      netdev_update_features - recalculate device features
5492 *      @dev: the device to check
5493 *
5494 *      Recalculate the dev->features set and send notifications if it
5495 *      has changed. Should be called after driver- or hardware-dependent
5496 *      conditions that influence the feature set might have changed.
5497 */
5498void netdev_update_features(struct net_device *dev)
5499{
5500        if (__netdev_update_features(dev))
5501                netdev_features_change(dev);
5502}
5503EXPORT_SYMBOL(netdev_update_features);
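
/*
 * Usage sketch (hypothetical driver, not from this file): re-evaluating the
 * feature set after a hardware-dependent condition changed, e.g. after an
 * invented offload engine was reconfigured. RTNL is required, as
 * __netdev_update_features() asserts. foo_offload_reconfigured() is an
 * invented name.
 */
static void foo_offload_reconfigured(struct net_device *dev)
{
        rtnl_lock();
        netdev_update_features(dev);    /* re-runs ndo_fix_features() and
                                         * netdev_fix_features() above */
        rtnl_unlock();
}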
5504
5505/**
5506 *      netdev_change_features - recalculate device features
5507 *      @dev: the device to check
5508 *
5509 *      Recalculate the dev->features set and send notifications even
5510 *      if the features have not changed. Should be called instead of
5511 *      netdev_update_features() if dev->vlan_features might also have
5512 *      changed, so that the changes are propagated to stacked
5513 *      VLAN devices.
5514 */
5515void netdev_change_features(struct net_device *dev)
5516{
5517        __netdev_update_features(dev);
5518        netdev_features_change(dev);
5519}
5520EXPORT_SYMBOL(netdev_change_features);
5521
5522/**
5523 *      netif_stacked_transfer_operstate -      transfer operstate
5524 *      @rootdev: the root or lower level device to transfer state from
5525 *      @dev: the device to transfer operstate to
5526 *
5527 *      Transfer operational state from root to device. This is normally
5528 *      called when a stacking relationship exists between the root
5529 *      device and the device (a leaf device).
5530 */
5531void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5532                                        struct net_device *dev)
5533{
5534        if (rootdev->operstate == IF_OPER_DORMANT)
5535                netif_dormant_on(dev);
5536        else
5537                netif_dormant_off(dev);
5538
5539        if (netif_carrier_ok(rootdev)) {
5540                if (!netif_carrier_ok(dev))
5541                        netif_carrier_on(dev);
5542        } else {
5543                if (netif_carrier_ok(dev))
5544                        netif_carrier_off(dev);
5545        }
5546}
5547EXPORT_SYMBOL(netif_stacked_transfer_operstate);
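
/*
 * Usage sketch (hypothetical stacking driver, not from this file): mirroring
 * the lower device's operstate from a netdevice notifier, the way VLAN-like
 * devices use this helper. foo_upper_dev_get() is an invented lookup.
 */
static int foo_netdev_event(struct notifier_block *nb,
                            unsigned long event, void *ptr)
{
        struct net_device *lower = ptr;
        struct net_device *upper = foo_upper_dev_get(lower);

        if (upper && event == NETDEV_CHANGE)
                netif_stacked_transfer_operstate(lower, upper);
        return NOTIFY_DONE;
}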
5548
5549#ifdef CONFIG_RPS
5550static int netif_alloc_rx_queues(struct net_device *dev)
5551{
5552        unsigned int i, count = dev->num_rx_queues;
5553        struct netdev_rx_queue *rx;
5554
5555        BUG_ON(count < 1);
5556
5557        rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5558        if (!rx) {
5559                pr_err("netdev: Unable to allocate %u rx queues\n", count);
5560                return -ENOMEM;
5561        }
5562        dev->_rx = rx;
5563
5564        for (i = 0; i < count; i++)
5565                rx[i].dev = dev;
5566        return 0;
5567}
5568#endif
5569
5570static void netdev_init_one_queue(struct net_device *dev,
5571                                  struct netdev_queue *queue, void *_unused)
5572{
5573        /* Initialize queue lock */
5574        spin_lock_init(&queue->_xmit_lock);
5575        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5576        queue->xmit_lock_owner = -1;
5577        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5578        queue->dev = dev;
5579#ifdef CONFIG_BQL
5580        dql_init(&queue->dql, HZ);
5581#endif
5582}
5583
5584static int netif_alloc_netdev_queues(struct net_device *dev)
5585{
5586        unsigned int count = dev->num_tx_queues;
5587        struct netdev_queue *tx;
5588
5589        BUG_ON(count < 1);
5590
5591        tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5592        if (!tx) {
5593                pr_err("netdev: Unable to allocate %u tx queues\n", count);
5594                return -ENOMEM;
5595        }
5596        dev->_tx = tx;
5597
5598        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5599        spin_lock_init(&dev->tx_global_lock);
5600
5601        return 0;
5602}
5603
5604/**
5605 *      register_netdevice      - register a network device
5606 *      @dev: device to register
5607 *
5608 *      Take a completed network device structure and add it to the kernel
5609 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5610 *      chain. 0 is returned on success. A negative errno code is returned
5611 *      on a failure to set up the device, or if the name is a duplicate.
5612 *
5613 *      Callers must hold the rtnl semaphore. You may want
5614 *      register_netdev() instead of this.
5615 *
5616 *      BUGS:
5617 *      The locking appears insufficient to guarantee two parallel registers
5618 *      will not get the same name.
5619 */
5620
5621int register_netdevice(struct net_device *dev)
5622{
5623        int ret;
5624        struct net *net = dev_net(dev);
5625
5626        BUG_ON(dev_boot_phase);
5627        ASSERT_RTNL();
5628
5629        might_sleep();
5630
5631        /* When net_devices are persistent, this will be fatal. */
5632        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5633        BUG_ON(!net);
5634
5635        spin_lock_init(&dev->addr_list_lock);
5636        netdev_set_addr_lockdep_class(dev);
5637
5638        dev->iflink = -1;
5639
5640        ret = dev_get_valid_name(net, dev, dev->name);
5641        if (ret < 0)
5642                goto out;
5643
5644        /* Init, if this function is available */
5645        if (dev->netdev_ops->ndo_init) {
5646                ret = dev->netdev_ops->ndo_init(dev);
5647                if (ret) {
5648                        if (ret > 0)
5649                                ret = -EIO;
5650                        goto out;
5651                }
5652        }
5653
5654        ret = -EBUSY;
5655        if (!dev->ifindex)
5656                dev->ifindex = dev_new_index(net);
5657        else if (__dev_get_by_index(net, dev->ifindex))
5658                goto err_uninit;
5659
5660        if (dev->iflink == -1)
5661                dev->iflink = dev->ifindex;
5662
5663        /* Transfer changeable features to wanted_features and enable
5664         * software offloads (GSO and GRO).
5665         */
5666        dev->hw_features |= NETIF_F_SOFT_FEATURES;
5667        dev->features |= NETIF_F_SOFT_FEATURES;
5668        dev->wanted_features = dev->features & dev->hw_features;
5669
5670        /* Turn on no cache copy if HW is doing checksum */
5671        if (!(dev->flags & IFF_LOOPBACK)) {
5672                dev->hw_features |= NETIF_F_NOCACHE_COPY;
5673                if (dev->features & NETIF_F_ALL_CSUM) {
5674                        dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5675                        dev->features |= NETIF_F_NOCACHE_COPY;
5676                }
5677        }
5678
5679        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5680         */
5681        dev->vlan_features |= NETIF_F_HIGHDMA;
5682
5683        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5684        ret = notifier_to_errno(ret);
5685        if (ret)
5686                goto err_uninit;
5687
5688        ret = netdev_register_kobject(dev);
5689        if (ret)
5690                goto err_uninit;
5691        dev->reg_state = NETREG_REGISTERED;
5692
5693        __netdev_update_features(dev);
5694
5695        /*
5696         *      Default initial state at registration is that the
5697         *      device is present.
5698         */
5699
5700        set_bit(__LINK_STATE_PRESENT, &dev->state);
5701
5702        linkwatch_init_dev(dev);
5703
5704        dev_init_scheduler(dev);
5705        dev_hold(dev);
5706        list_netdevice(dev);
5707        add_device_randomness(dev->dev_addr, dev->addr_len);
5708
5709        /* Notify protocols, that a new device appeared. */
5710        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5711        ret = notifier_to_errno(ret);
5712        if (ret) {
5713                rollback_registered(dev);
5714                dev->reg_state = NETREG_UNREGISTERED;
5715        }
5716        /*
5717         *      Prevent userspace races by waiting until the network
5718         *      device is fully setup before sending notifications.
5719         */
5720        if (!dev->rtnl_link_ops ||
5721            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5722                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5723
5724out:
5725        return ret;
5726
5727err_uninit:
5728        if (dev->netdev_ops->ndo_uninit)
5729                dev->netdev_ops->ndo_uninit(dev);
5730        goto out;
5731}
5732EXPORT_SYMBOL(register_netdevice);
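
/*
 * Usage sketch (hypothetical driver, not from this file): register_netdevice()
 * is for callers that already hold RTNL, such as an rtnl_link_ops ->newlink()
 * implementation. foo_newlink() is an invented name; validation and error
 * handling are omitted.
 */
static int foo_newlink(struct net *src_net, struct net_device *dev,
                       struct nlattr *tb[], struct nlattr *data[])
{
        /* ->setup() has prepared dev; RTNL is already held here */
        return register_netdevice(dev);
}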
5733
5734/**
5735 *      init_dummy_netdev       - init a dummy network device for NAPI
5736 *      @dev: device to init
5737 *
5738 *      This takes a network device structure and initializes the minimum
5739 *      number of fields so it can be used to schedule NAPI polls without
5740 *      registering a full-blown interface. This is to be used by drivers
5741 *      that need to tie several hardware interfaces to a single NAPI
5742 *      poll scheduler due to HW limitations.
5743 */
5744int init_dummy_netdev(struct net_device *dev)
5745{
5746        /* Clear everything. Note we don't initialize spinlocks
5747         * as they aren't supposed to be taken by any of the
5748         * NAPI code and this dummy netdev is only ever supposed
5749         * to be used for NAPI polls.
5750         */
5751        memset(dev, 0, sizeof(struct net_device));
5752
5753        /* make sure we BUG if trying to hit standard
5754         * register/unregister code path
5755         */
5756        dev->reg_state = NETREG_DUMMY;
5757
5758        /* NAPI wants this */
5759        INIT_LIST_HEAD(&dev->napi_list);
5760
5761        /* a dummy interface is started by default */
5762        set_bit(__LINK_STATE_PRESENT, &dev->state);
5763        set_bit(__LINK_STATE_START, &dev->state);
5764
5765        /* Note: We don't allocate pcpu_refcnt for dummy devices,
5766         * because users of this 'device' don't need to change
5767         * its refcount.
5768         */
5769
5770        return 0;
5771}
5772EXPORT_SYMBOL_GPL(init_dummy_netdev);
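
/*
 * Usage sketch (hypothetical driver, not from this file): the intended use of
 * init_dummy_netdev() - several hardware channels funnelled into one NAPI
 * context hung off a dummy, never-registered net_device. struct foo_adapter,
 * foo_poll() and foo_setup_napi() are invented.
 */
struct foo_adapter {
        struct net_device napi_dev;     /* dummy device, never registered */
        struct napi_struct napi;
};

static int foo_poll(struct napi_struct *napi, int budget)
{
        /* ... process up to budget packets from the hardware here ... */
        napi_complete(napi);
        return 0;
}

static void foo_setup_napi(struct foo_adapter *ad)
{
        init_dummy_netdev(&ad->napi_dev);
        netif_napi_add(&ad->napi_dev, &ad->napi, foo_poll, 64);
}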
5773
5774
5775/**
5776 *      register_netdev - register a network device
5777 *      @dev: device to register
5778 *
5779 *      Take a completed network device structure and add it to the kernel
5780 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5781 *      chain. 0 is returned on success. A negative errno code is returned
5782 *      on a failure to set up the device, or if the name is a duplicate.
5783 *
5784 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5785 *      and expands the device name if you passed a format string to
5786 *      alloc_netdev.
5787 */
5788int register_netdev(struct net_device *dev)
5789{
5790        int err;
5791
5792        rtnl_lock();
5793        err = register_netdevice(dev);
5794        rtnl_unlock();
5795        return err;
5796}
5797EXPORT_SYMBOL(register_netdev);
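
/*
 * Usage sketch (hypothetical driver, not from this file): the usual probe
 * pattern around register_netdev()/free_netdev(). struct foo_priv and
 * foo_probe() are invented; ops and ethtool setup are omitted.
 */
struct foo_priv {
        int foo_state;                  /* invented private data */
};

static int foo_probe(void)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct foo_priv));
        if (!dev)
                return -ENOMEM;

        /* dev->netdev_ops, ethtool_ops, MAC address etc. would be set here */

        err = register_netdev(dev);     /* takes RTNL internally */
        if (err) {
                free_netdev(dev);
                return err;
        }
        return 0;
}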
5798
5799int netdev_refcnt_read(const struct net_device *dev)
5800{
5801        int i, refcnt = 0;
5802
5803        for_each_possible_cpu(i)
5804                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5805        return refcnt;
5806}
5807EXPORT_SYMBOL(netdev_refcnt_read);
5808
5809/**
5810 * netdev_wait_allrefs - wait until all references are gone.
5811 * @dev: target net_device
5812 *
5813 * This is called when unregistering network devices.
5814 *
5815 * Any protocol or device that holds a reference should register
5816 * for netdevice notification, and clean up and release the
5817 * reference if they receive an UNREGISTER event.
5818 * We can get stuck here if buggy protocols don't correctly
5819 * call dev_put.
5820 */
5821static void netdev_wait_allrefs(struct net_device *dev)
5822{
5823        unsigned long rebroadcast_time, warning_time;
5824        int refcnt;
5825
5826        linkwatch_forget_dev(dev);
5827
5828        rebroadcast_time = warning_time = jiffies;
5829        refcnt = netdev_refcnt_read(dev);
5830
5831        while (refcnt != 0) {
5832                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5833                        rtnl_lock();
5834
5835                        /* Rebroadcast unregister notification */
5836                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5837
5838                        __rtnl_unlock();
5839                        rcu_barrier();
5840                        rtnl_lock();
5841
5842                        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5843                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5844                                     &dev->state)) {
5845                                /* We must not have linkwatch events
5846                                 * pending on unregister. If this
5847                                 * happens, we simply run the queue
5848