linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135#include <linux/cpu_rmap.h>
 136
 137#include "net-sysfs.h"
 138
 139/* Instead of increasing this, you should create a hash table. */
 140#define MAX_GRO_SKBS 8
 141
 142/* This should be increased if a protocol with a bigger head is added. */
 143#define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145/*
 146 *      The list of packet types we will receive (as opposed to discard)
 147 *      and the routines to invoke.
 148 *
  149 *      Why 16? Because with 16 the only overlap we get on a hash of the
 150 *      low nibble of the protocol value is RARP/SNAP/X.25.
 151 *
 152 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153 *             sure which should go first, but I bet it won't make much
 154 *             difference if we are running VLANs.  The good news is that
 155 *             this protocol won't be in the list unless compiled in, so
 156 *             the average user (w/out VLANs) will not be adversely affected.
 157 *             --BLG
 158 *
 159 *              0800    IP
 160 *              8100    802.1Q VLAN
 161 *              0001    802.3
 162 *              0002    AX.25
 163 *              0004    802.2
 164 *              8035    RARP
 165 *              0005    SNAP
 166 *              0805    X.25
 167 *              0806    ARP
 168 *              8137    IPX
 169 *              0009    Localtalk
 170 *              86DD    IPv6
 171 */
 172
 173#define PTYPE_HASH_SIZE (16)
 174#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 175
 176static DEFINE_SPINLOCK(ptype_lock);
 177static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 178static struct list_head ptype_all __read_mostly;        /* Taps */
 179
 180/*
 181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182 * semaphore.
 183 *
 184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 185 *
 186 * Writers must hold the rtnl semaphore while they loop through the
 187 * dev_base_head list, and hold dev_base_lock for writing when they do the
 188 * actual updates.  This allows pure readers to access the list even
 189 * while a writer is preparing to update it.
 190 *
 191 * To put it another way, dev_base_lock is held for writing only to
 192 * protect against pure readers; the rtnl semaphore provides the
 193 * protection against other writers.
 194 *
 195 * See, for example usages, register_netdevice() and
 196 * unregister_netdevice(), which must be called with the rtnl
 197 * semaphore held.
 198 */
 199DEFINE_RWLOCK(dev_base_lock);
 200EXPORT_SYMBOL(dev_base_lock);
 201
 202static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 203{
 204        unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 205        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 206}
 207
 208static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 209{
 210        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 211}
 212
 213static inline void rps_lock(struct softnet_data *sd)
 214{
 215#ifdef CONFIG_RPS
 216        spin_lock(&sd->input_pkt_queue.lock);
 217#endif
 218}
 219
 220static inline void rps_unlock(struct softnet_data *sd)
 221{
 222#ifdef CONFIG_RPS
 223        spin_unlock(&sd->input_pkt_queue.lock);
 224#endif
 225}
 226
 227/* Device list insertion */
 228static int list_netdevice(struct net_device *dev)
 229{
 230        struct net *net = dev_net(dev);
 231
 232        ASSERT_RTNL();
 233
 234        write_lock_bh(&dev_base_lock);
 235        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 236        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 237        hlist_add_head_rcu(&dev->index_hlist,
 238                           dev_index_hash(net, dev->ifindex));
 239        write_unlock_bh(&dev_base_lock);
 240        return 0;
 241}
 242
 243/* Device list removal
 244 * caller must respect a RCU grace period before freeing/reusing dev
 245 */
 246static void unlist_netdevice(struct net_device *dev)
 247{
 248        ASSERT_RTNL();
 249
 250        /* Unlink dev from the device chain */
 251        write_lock_bh(&dev_base_lock);
 252        list_del_rcu(&dev->dev_list);
 253        hlist_del_rcu(&dev->name_hlist);
 254        hlist_del_rcu(&dev->index_hlist);
 255        write_unlock_bh(&dev_base_lock);
 256}
 257
 258/*
 259 *      Our notifier list
 260 */
 261
 262static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264/*
 265 *      Device drivers call our routines to queue packets here. We empty the
 266 *      queue in the local softnet handler.
 267 */
 268
 269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272#ifdef CONFIG_LOCKDEP
 273/*
 274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275 * according to dev->type
 276 */
 277static const unsigned short netdev_lock_type[] =
 278        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 291         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 292         ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 293         ARPHRD_VOID, ARPHRD_NONE};
 294
 295static const char *const netdev_lock_name[] =
 296        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 297         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 298         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 299         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 300         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 301         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 302         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 303         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 304         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 305         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 306         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 307         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 308         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 309         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 310         "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 311         "_xmit_VOID", "_xmit_NONE"};
 312
 313static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 315
 316static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 317{
 318        int i;
 319
 320        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 321                if (netdev_lock_type[i] == dev_type)
 322                        return i;
 323        /* the last key is used by default */
 324        return ARRAY_SIZE(netdev_lock_type) - 1;
 325}
 326
 327static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328                                                 unsigned short dev_type)
 329{
 330        int i;
 331
 332        i = netdev_lock_pos(dev_type);
 333        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 334                                   netdev_lock_name[i]);
 335}
 336
 337static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 338{
 339        int i;
 340
 341        i = netdev_lock_pos(dev->type);
 342        lockdep_set_class_and_name(&dev->addr_list_lock,
 343                                   &netdev_addr_lock_key[i],
 344                                   netdev_lock_name[i]);
 345}
 346#else
 347static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 348                                                 unsigned short dev_type)
 349{
 350}
 351static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352{
 353}
 354#endif
 355
 356/*******************************************************************************
 357
 358                Protocol management and registration routines
 359
 360*******************************************************************************/
 361
 362/*
 363 *      Add a protocol ID to the list. Now that the input handler is
 364 *      smarter we can dispense with all the messy stuff that used to be
 365 *      here.
 366 *
 367 *      BEWARE!!! Protocol handlers, mangling input packets,
 368 *      MUST BE last in hash buckets and checking protocol handlers
 369 *      MUST start from promiscuous ptype_all chain in net_bh.
 370 *      It is true now, do not change it.
 371 *      Explanation follows: if protocol handler, mangling packet, will
 372 *      be the first on list, it is not able to sense, that packet
 373 *      is cloned and should be copied-on-write, so that it will
 374 *      change it and subsequent readers will get broken packet.
 375 *                                                      --ANK (980803)
 376 */
 377
 378static inline struct list_head *ptype_head(const struct packet_type *pt)
 379{
 380        if (pt->type == htons(ETH_P_ALL))
 381                return &ptype_all;
 382        else
 383                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 384}
 385
 386/**
 387 *      dev_add_pack - add packet handler
 388 *      @pt: packet type declaration
 389 *
 390 *      Add a protocol handler to the networking stack. The passed &packet_type
 391 *      is linked into kernel lists and may not be freed until it has been
 392 *      removed from the kernel lists.
 393 *
  394 *      This call does not sleep, therefore it cannot
  395 *      guarantee that all CPUs in the middle of receiving packets
  396 *      will see the new packet type (until the next received packet).
 397 */
 398
 399void dev_add_pack(struct packet_type *pt)
 400{
 401        struct list_head *head = ptype_head(pt);
 402
 403        spin_lock(&ptype_lock);
 404        list_add_rcu(&pt->list, head);
 405        spin_unlock(&ptype_lock);
 406}
 407EXPORT_SYMBOL(dev_add_pack);
 408
 409/**
 410 *      __dev_remove_pack        - remove packet handler
 411 *      @pt: packet type declaration
 412 *
 413 *      Remove a protocol handler that was previously added to the kernel
 414 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 415 *      from the kernel lists and can be freed or reused once this function
 416 *      returns.
 417 *
 418 *      The packet type might still be in use by receivers
  419 *      and must not be freed until after all the CPUs have gone
 420 *      through a quiescent state.
 421 */
 422void __dev_remove_pack(struct packet_type *pt)
 423{
 424        struct list_head *head = ptype_head(pt);
 425        struct packet_type *pt1;
 426
 427        spin_lock(&ptype_lock);
 428
 429        list_for_each_entry(pt1, head, list) {
 430                if (pt == pt1) {
 431                        list_del_rcu(&pt->list);
 432                        goto out;
 433                }
 434        }
 435
 436        printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 437out:
 438        spin_unlock(&ptype_lock);
 439}
 440EXPORT_SYMBOL(__dev_remove_pack);
 441
 442/**
 443 *      dev_remove_pack  - remove packet handler
 444 *      @pt: packet type declaration
 445 *
 446 *      Remove a protocol handler that was previously added to the kernel
 447 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448 *      from the kernel lists and can be freed or reused once this function
 449 *      returns.
 450 *
 451 *      This call sleeps to guarantee that no CPU is looking at the packet
 452 *      type after return.
 453 */
 454void dev_remove_pack(struct packet_type *pt)
 455{
 456        __dev_remove_pack(pt);
 457
 458        synchronize_net();
 459}
 460EXPORT_SYMBOL(dev_remove_pack);
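
/*
 *      Example: a minimal sketch of typical caller code pairing
 *      dev_add_pack() with dev_remove_pack().  The handler and variable
 *      names are illustrative only:
 *
 *              static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *                                     struct packet_type *pt,
 *                                     struct net_device *orig_dev)
 *              {
 *                      kfree_skb(skb);
 *                      return NET_RX_SUCCESS;
 *              }
 *
 *              static struct packet_type example_pt __read_mostly = {
 *                      .type = cpu_to_be16(ETH_P_IP),
 *                      .func = example_rcv,
 *              };
 *
 *              dev_add_pack(&example_pt);
 *              ...
 *              dev_remove_pack(&example_pt);
 */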
 461
 462/******************************************************************************
 463
 464                      Device Boot-time Settings Routines
 465
 466*******************************************************************************/
 467
 468/* Boot time configuration table */
 469static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 470
 471/**
 472 *      netdev_boot_setup_add   - add new setup entry
 473 *      @name: name of the device
 474 *      @map: configured settings for the device
 475 *
  476 *      Adds a new setup entry to the dev_boot_setup list.  The function
  477 *      returns 0 on error and 1 on success.  This is a generic routine for
  478 *      all netdevices.
 479 */
 480static int netdev_boot_setup_add(char *name, struct ifmap *map)
 481{
 482        struct netdev_boot_setup *s;
 483        int i;
 484
 485        s = dev_boot_setup;
 486        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 487                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 488                        memset(s[i].name, 0, sizeof(s[i].name));
 489                        strlcpy(s[i].name, name, IFNAMSIZ);
 490                        memcpy(&s[i].map, map, sizeof(s[i].map));
 491                        break;
 492                }
 493        }
 494
 495        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 496}
 497
 498/**
 499 *      netdev_boot_setup_check - check boot time settings
 500 *      @dev: the netdevice
 501 *
 502 *      Check boot time settings for the device.
 503 *      The found settings are set for the device to be used
 504 *      later in the device probing.
  505 *      Returns 0 if no settings are found, 1 if they are.
 506 */
 507int netdev_boot_setup_check(struct net_device *dev)
 508{
 509        struct netdev_boot_setup *s = dev_boot_setup;
 510        int i;
 511
 512        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 513                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 514                    !strcmp(dev->name, s[i].name)) {
 515                        dev->irq        = s[i].map.irq;
 516                        dev->base_addr  = s[i].map.base_addr;
 517                        dev->mem_start  = s[i].map.mem_start;
 518                        dev->mem_end    = s[i].map.mem_end;
 519                        return 1;
 520                }
 521        }
 522        return 0;
 523}
 524EXPORT_SYMBOL(netdev_boot_setup_check);
 525
 526
 527/**
 528 *      netdev_boot_base        - get address from boot time settings
 529 *      @prefix: prefix for network device
 530 *      @unit: id for network device
 531 *
  532 *      Check boot time settings for the base address of the device.
  533 *      The found settings are set for the device to be used
  534 *      later in the device probing.
  535 *      Returns 0 if no settings are found.
 536 */
 537unsigned long netdev_boot_base(const char *prefix, int unit)
 538{
 539        const struct netdev_boot_setup *s = dev_boot_setup;
 540        char name[IFNAMSIZ];
 541        int i;
 542
 543        sprintf(name, "%s%d", prefix, unit);
 544
 545        /*
 546         * If device already registered then return base of 1
 547         * to indicate not to probe for this interface
 548         */
 549        if (__dev_get_by_name(&init_net, name))
 550                return 1;
 551
 552        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 553                if (!strcmp(name, s[i].name))
 554                        return s[i].map.base_addr;
 555        return 0;
 556}
 557
 558/*
  559 * Saves the settings configured at boot time for any netdevice.
 560 */
 561int __init netdev_boot_setup(char *str)
 562{
 563        int ints[5];
 564        struct ifmap map;
 565
 566        str = get_options(str, ARRAY_SIZE(ints), ints);
 567        if (!str || !*str)
 568                return 0;
 569
 570        /* Save settings */
 571        memset(&map, 0, sizeof(map));
 572        if (ints[0] > 0)
 573                map.irq = ints[1];
 574        if (ints[0] > 1)
 575                map.base_addr = ints[2];
 576        if (ints[0] > 2)
 577                map.mem_start = ints[3];
 578        if (ints[0] > 3)
 579                map.mem_end = ints[4];
 580
 581        /* Add new entry to the list */
 582        return netdev_boot_setup_add(str, &map);
 583}
 584
 585__setup("netdev=", netdev_boot_setup);
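
/*
 *      Example: with the parsing above, a boot command line option such as
 *
 *              netdev=9,0x300,eth0
 *
 *      (values illustrative) records irq 9 and I/O base 0x300 in
 *      dev_boot_setup; a driver that later probes "eth0" picks them up
 *      via netdev_boot_setup_check().
 */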
 586
 587/*******************************************************************************
 588
 589                            Device Interface Subroutines
 590
 591*******************************************************************************/
 592
 593/**
 594 *      __dev_get_by_name       - find a device by its name
 595 *      @net: the applicable net namespace
 596 *      @name: name to find
 597 *
  598 *      Find an interface by name. Must be called under the RTNL semaphore
 599 *      or @dev_base_lock. If the name is found a pointer to the device
 600 *      is returned. If the name is not found then %NULL is returned. The
 601 *      reference counters are not incremented so the caller must be
 602 *      careful with locks.
 603 */
 604
 605struct net_device *__dev_get_by_name(struct net *net, const char *name)
 606{
 607        struct hlist_node *p;
 608        struct net_device *dev;
 609        struct hlist_head *head = dev_name_hash(net, name);
 610
 611        hlist_for_each_entry(dev, p, head, name_hlist)
 612                if (!strncmp(dev->name, name, IFNAMSIZ))
 613                        return dev;
 614
 615        return NULL;
 616}
 617EXPORT_SYMBOL(__dev_get_by_name);
 618
 619/**
 620 *      dev_get_by_name_rcu     - find a device by its name
 621 *      @net: the applicable net namespace
 622 *      @name: name to find
 623 *
 624 *      Find an interface by name.
 625 *      If the name is found a pointer to the device is returned.
 626 *      If the name is not found then %NULL is returned.
 627 *      The reference counters are not incremented so the caller must be
 628 *      careful with locks. The caller must hold RCU lock.
 629 */
 630
 631struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 632{
 633        struct hlist_node *p;
 634        struct net_device *dev;
 635        struct hlist_head *head = dev_name_hash(net, name);
 636
 637        hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 638                if (!strncmp(dev->name, name, IFNAMSIZ))
 639                        return dev;
 640
 641        return NULL;
 642}
 643EXPORT_SYMBOL(dev_get_by_name_rcu);
 644
 645/**
 646 *      dev_get_by_name         - find a device by its name
 647 *      @net: the applicable net namespace
 648 *      @name: name to find
 649 *
 650 *      Find an interface by name. This can be called from any
 651 *      context and does its own locking. The returned handle has
 652 *      the usage count incremented and the caller must use dev_put() to
 653 *      release it when it is no longer needed. %NULL is returned if no
 654 *      matching device is found.
 655 */
 656
 657struct net_device *dev_get_by_name(struct net *net, const char *name)
 658{
 659        struct net_device *dev;
 660
 661        rcu_read_lock();
 662        dev = dev_get_by_name_rcu(net, name);
 663        if (dev)
 664                dev_hold(dev);
 665        rcu_read_unlock();
 666        return dev;
 667}
 668EXPORT_SYMBOL(dev_get_by_name);
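
/*
 *      Example: a minimal sketch of caller code (the device name "eth0" is
 *      illustrative):
 *
 *              struct net_device *dev = dev_get_by_name(net, "eth0");
 *
 *              if (dev) {
 *                      ... use dev ...
 *                      dev_put(dev);
 *              }
 *
 *      or, from a caller that stays inside an RCU read-side section and
 *      does not need to hold a reference:
 *
 *              rcu_read_lock();
 *              dev = dev_get_by_name_rcu(net, "eth0");
 *              if (dev)
 *                      ... use dev, no dev_put() needed ...
 *              rcu_read_unlock();
 */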
 669
 670/**
 671 *      __dev_get_by_index - find a device by its ifindex
 672 *      @net: the applicable net namespace
 673 *      @ifindex: index of device
 674 *
  675 *      Search for an interface by index. Returns a pointer to the device,
  676 *      or %NULL if the device is not found. The device has not
 677 *      had its reference counter increased so the caller must be careful
 678 *      about locking. The caller must hold either the RTNL semaphore
 679 *      or @dev_base_lock.
 680 */
 681
 682struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 683{
 684        struct hlist_node *p;
 685        struct net_device *dev;
 686        struct hlist_head *head = dev_index_hash(net, ifindex);
 687
 688        hlist_for_each_entry(dev, p, head, index_hlist)
 689                if (dev->ifindex == ifindex)
 690                        return dev;
 691
 692        return NULL;
 693}
 694EXPORT_SYMBOL(__dev_get_by_index);
 695
 696/**
 697 *      dev_get_by_index_rcu - find a device by its ifindex
 698 *      @net: the applicable net namespace
 699 *      @ifindex: index of device
 700 *
  701 *      Search for an interface by index. Returns a pointer to the device,
  702 *      or %NULL if the device is not found. The device has not
 703 *      had its reference counter increased so the caller must be careful
 704 *      about locking. The caller must hold RCU lock.
 705 */
 706
 707struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 708{
 709        struct hlist_node *p;
 710        struct net_device *dev;
 711        struct hlist_head *head = dev_index_hash(net, ifindex);
 712
 713        hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 714                if (dev->ifindex == ifindex)
 715                        return dev;
 716
 717        return NULL;
 718}
 719EXPORT_SYMBOL(dev_get_by_index_rcu);
 720
 721
 722/**
 723 *      dev_get_by_index - find a device by its ifindex
 724 *      @net: the applicable net namespace
 725 *      @ifindex: index of device
 726 *
  727 *      Search for an interface by index. Returns a pointer to the device,
  728 *      or NULL if the device is not found. The device returned has
 729 *      had a reference added and the pointer is safe until the user calls
 730 *      dev_put to indicate they have finished with it.
 731 */
 732
 733struct net_device *dev_get_by_index(struct net *net, int ifindex)
 734{
 735        struct net_device *dev;
 736
 737        rcu_read_lock();
 738        dev = dev_get_by_index_rcu(net, ifindex);
 739        if (dev)
 740                dev_hold(dev);
 741        rcu_read_unlock();
 742        return dev;
 743}
 744EXPORT_SYMBOL(dev_get_by_index);
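
/*
 *      Example sketch (the ifindex is assumed to come from elsewhere,
 *      e.g. a netlink attribute):
 *
 *              struct net_device *dev = dev_get_by_index(net, ifindex);
 *
 *              if (!dev)
 *                      return -ENODEV;
 *              ... use dev ...
 *              dev_put(dev);
 */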
 745
 746/**
 747 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 748 *      @net: the applicable net namespace
 749 *      @type: media type of device
 750 *      @ha: hardware address
 751 *
  752 *      Search for an interface by MAC address. Returns a pointer to the
  753 *      device, or NULL if the device is not found.
  754 *      The caller must hold RCU or RTNL.
  755 *      The returned device has not had its ref count increased
  756 *      and the caller must therefore be careful about locking.
 757 *
 758 */
 759
 760struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 761                                       const char *ha)
 762{
 763        struct net_device *dev;
 764
 765        for_each_netdev_rcu(net, dev)
 766                if (dev->type == type &&
 767                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 768                        return dev;
 769
 770        return NULL;
 771}
 772EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 773
 774struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 775{
 776        struct net_device *dev;
 777
 778        ASSERT_RTNL();
 779        for_each_netdev(net, dev)
 780                if (dev->type == type)
 781                        return dev;
 782
 783        return NULL;
 784}
 785EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 786
 787struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 788{
 789        struct net_device *dev, *ret = NULL;
 790
 791        rcu_read_lock();
 792        for_each_netdev_rcu(net, dev)
 793                if (dev->type == type) {
 794                        dev_hold(dev);
 795                        ret = dev;
 796                        break;
 797                }
 798        rcu_read_unlock();
 799        return ret;
 800}
 801EXPORT_SYMBOL(dev_getfirstbyhwtype);
 802
 803/**
 804 *      dev_get_by_flags_rcu - find any device with given flags
 805 *      @net: the applicable net namespace
 806 *      @if_flags: IFF_* values
 807 *      @mask: bitmask of bits in if_flags to check
 808 *
  809 *      Search for any interface with the given flags. Returns a pointer to
  810 *      the first matching device, or NULL if none is found. Must be called
  811 *      inside rcu_read_lock(), and the result's refcount is unchanged.
 812 */
 813
 814struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 815                                    unsigned short mask)
 816{
 817        struct net_device *dev, *ret;
 818
 819        ret = NULL;
 820        for_each_netdev_rcu(net, dev) {
 821                if (((dev->flags ^ if_flags) & mask) == 0) {
 822                        ret = dev;
 823                        break;
 824                }
 825        }
 826        return ret;
 827}
 828EXPORT_SYMBOL(dev_get_by_flags_rcu);
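
/*
 *      Example sketch: looking for any device with IFF_LOOPBACK set,
 *      testing only that bit:
 *
 *              rcu_read_lock();
 *              dev = dev_get_by_flags_rcu(net, IFF_LOOPBACK, IFF_LOOPBACK);
 *              if (dev)
 *                      ... use dev inside the rcu_read_lock() section only,
 *                          or take a reference with dev_hold() ...
 *              rcu_read_unlock();
 */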
 829
 830/**
 831 *      dev_valid_name - check if name is okay for network device
 832 *      @name: name string
 833 *
  834 *      Network device names need to be valid file names to
  835 *      allow sysfs to work.  We also disallow any kind of
 836 *      whitespace.
 837 */
 838int dev_valid_name(const char *name)
 839{
 840        if (*name == '\0')
 841                return 0;
 842        if (strlen(name) >= IFNAMSIZ)
 843                return 0;
 844        if (!strcmp(name, ".") || !strcmp(name, ".."))
 845                return 0;
 846
 847        while (*name) {
 848                if (*name == '/' || isspace(*name))
 849                        return 0;
 850                name++;
 851        }
 852        return 1;
 853}
 854EXPORT_SYMBOL(dev_valid_name);
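
/*
 *      Example: names such as "eth0" or "wlan-guest" are accepted, while
 *      "", ".", "..", names containing '/' or whitespace, and names of
 *      IFNAMSIZ characters or more are rejected.
 */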
 855
 856/**
 857 *      __dev_alloc_name - allocate a name for a device
 858 *      @net: network namespace to allocate the device name in
 859 *      @name: name format string
 860 *      @buf:  scratch buffer and result name string
 861 *
  862 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
  863 *      id. It scans the list of devices to build up a free map, then chooses
 864 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 865 *      while allocating the name and adding the device in order to avoid
 866 *      duplicates.
 867 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 868 *      Returns the number of the unit assigned or a negative errno code.
 869 */
 870
 871static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 872{
 873        int i = 0;
 874        const char *p;
 875        const int max_netdevices = 8*PAGE_SIZE;
 876        unsigned long *inuse;
 877        struct net_device *d;
 878
 879        p = strnchr(name, IFNAMSIZ-1, '%');
 880        if (p) {
 881                /*
 882                 * Verify the string as this thing may have come from
 883                 * the user.  There must be either one "%d" and no other "%"
 884                 * characters.
 885                 */
 886                if (p[1] != 'd' || strchr(p + 2, '%'))
 887                        return -EINVAL;
 888
 889                /* Use one page as a bit array of possible slots */
 890                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 891                if (!inuse)
 892                        return -ENOMEM;
 893
 894                for_each_netdev(net, d) {
 895                        if (!sscanf(d->name, name, &i))
 896                                continue;
 897                        if (i < 0 || i >= max_netdevices)
 898                                continue;
 899
 900                        /*  avoid cases where sscanf is not exact inverse of printf */
 901                        snprintf(buf, IFNAMSIZ, name, i);
 902                        if (!strncmp(buf, d->name, IFNAMSIZ))
 903                                set_bit(i, inuse);
 904                }
 905
 906                i = find_first_zero_bit(inuse, max_netdevices);
 907                free_page((unsigned long) inuse);
 908        }
 909
 910        if (buf != name)
 911                snprintf(buf, IFNAMSIZ, name, i);
 912        if (!__dev_get_by_name(net, buf))
 913                return i;
 914
 915        /* It is possible to run out of possible slots
 916         * when the name is long and there isn't enough space left
 917         * for the digits, or if all bits are used.
 918         */
 919        return -ENFILE;
 920}
 921
 922/**
 923 *      dev_alloc_name - allocate a name for a device
 924 *      @dev: device
 925 *      @name: name format string
 926 *
  927 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
  928 *      id. It scans the list of devices to build up a free map, then chooses
 929 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 930 *      while allocating the name and adding the device in order to avoid
 931 *      duplicates.
 932 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 933 *      Returns the number of the unit assigned or a negative errno code.
 934 */
 935
 936int dev_alloc_name(struct net_device *dev, const char *name)
 937{
 938        char buf[IFNAMSIZ];
 939        struct net *net;
 940        int ret;
 941
 942        BUG_ON(!dev_net(dev));
 943        net = dev_net(dev);
 944        ret = __dev_alloc_name(net, name, buf);
 945        if (ret >= 0)
 946                strlcpy(dev->name, buf, IFNAMSIZ);
 947        return ret;
 948}
 949EXPORT_SYMBOL(dev_alloc_name);
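
/*
 *      Example: a driver that names its devices after a pattern typically
 *      calls
 *
 *              err = dev_alloc_name(dev, "eth%d");
 *
 *      which fills in dev->name with the lowest free "ethN" in the device's
 *      namespace and returns N (or a negative errno).
 */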
 950
 951static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 952{
 953        struct net *net;
 954
 955        BUG_ON(!dev_net(dev));
 956        net = dev_net(dev);
 957
 958        if (!dev_valid_name(name))
 959                return -EINVAL;
 960
 961        if (fmt && strchr(name, '%'))
 962                return dev_alloc_name(dev, name);
 963        else if (__dev_get_by_name(net, name))
 964                return -EEXIST;
 965        else if (dev->name != name)
 966                strlcpy(dev->name, name, IFNAMSIZ);
 967
 968        return 0;
 969}
 970
 971/**
 972 *      dev_change_name - change name of a device
 973 *      @dev: device
 974 *      @newname: name (or format string) must be at least IFNAMSIZ
 975 *
  976 *      Change name of a device, can pass format strings "eth%d"
  977 *      for wildcarding.
 978 */
 979int dev_change_name(struct net_device *dev, const char *newname)
 980{
 981        char oldname[IFNAMSIZ];
 982        int err = 0;
 983        int ret;
 984        struct net *net;
 985
 986        ASSERT_RTNL();
 987        BUG_ON(!dev_net(dev));
 988
 989        net = dev_net(dev);
 990        if (dev->flags & IFF_UP)
 991                return -EBUSY;
 992
 993        if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 994                return 0;
 995
 996        memcpy(oldname, dev->name, IFNAMSIZ);
 997
 998        err = dev_get_valid_name(dev, newname, 1);
 999        if (err < 0)
1000                return err;
1001
1002rollback:
1003        ret = device_rename(&dev->dev, dev->name);
1004        if (ret) {
1005                memcpy(dev->name, oldname, IFNAMSIZ);
1006                return ret;
1007        }
1008
1009        write_lock_bh(&dev_base_lock);
1010        hlist_del(&dev->name_hlist);
1011        write_unlock_bh(&dev_base_lock);
1012
1013        synchronize_rcu();
1014
1015        write_lock_bh(&dev_base_lock);
1016        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017        write_unlock_bh(&dev_base_lock);
1018
1019        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020        ret = notifier_to_errno(ret);
1021
1022        if (ret) {
1023                /* err >= 0 after dev_alloc_name() or stores the first errno */
1024                if (err >= 0) {
1025                        err = ret;
1026                        memcpy(dev->name, oldname, IFNAMSIZ);
1027                        goto rollback;
1028                } else {
1029                        printk(KERN_ERR
1030                               "%s: name change rollback failed: %d.\n",
1031                               dev->name, ret);
1032                }
1033        }
1034
1035        return err;
1036}
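
/*
 *      Example sketch: renaming a device (it must not be up, and the RTNL
 *      lock must be held):
 *
 *              rtnl_lock();
 *              err = dev_change_name(dev, "lan%d");
 *              rtnl_unlock();
 */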
1037
1038/**
1039 *      dev_set_alias - change ifalias of a device
1040 *      @dev: device
1041 *      @alias: name up to IFALIASZ
1042 *      @len: limit of bytes to copy from info
1043 *
 1044 *      Set ifalias for a device.
1045 */
1046int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047{
1048        ASSERT_RTNL();
1049
1050        if (len >= IFALIASZ)
1051                return -EINVAL;
1052
1053        if (!len) {
1054                if (dev->ifalias) {
1055                        kfree(dev->ifalias);
1056                        dev->ifalias = NULL;
1057                }
1058                return 0;
1059        }
1060
1061        dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062        if (!dev->ifalias)
1063                return -ENOMEM;
1064
1065        strlcpy(dev->ifalias, alias, len+1);
1066        return len;
1067}
1068
1069
1070/**
1071 *      netdev_features_change - device changes features
1072 *      @dev: device to cause notification
1073 *
1074 *      Called to indicate a device has changed features.
1075 */
1076void netdev_features_change(struct net_device *dev)
1077{
1078        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079}
1080EXPORT_SYMBOL(netdev_features_change);
1081
1082/**
1083 *      netdev_state_change - device changes state
1084 *      @dev: device to cause notification
1085 *
1086 *      Called to indicate a device has changed state. This function calls
1087 *      the notifier chains for netdev_chain and sends a NEWLINK message
1088 *      to the routing socket.
1089 */
1090void netdev_state_change(struct net_device *dev)
1091{
1092        if (dev->flags & IFF_UP) {
1093                call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094                rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095        }
1096}
1097EXPORT_SYMBOL(netdev_state_change);
1098
1099int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100{
1101        return call_netdevice_notifiers(event, dev);
1102}
1103EXPORT_SYMBOL(netdev_bonding_change);
1104
1105/**
1106 *      dev_load        - load a network module
1107 *      @net: the applicable net namespace
1108 *      @name: name of interface
1109 *
1110 *      If a network interface is not present and the process has suitable
1111 *      privileges this function loads the module. If module loading is not
1112 *      available in this kernel then it becomes a nop.
1113 */
1114
1115void dev_load(struct net *net, const char *name)
1116{
1117        struct net_device *dev;
1118        int no_module;
1119
1120        rcu_read_lock();
1121        dev = dev_get_by_name_rcu(net, name);
1122        rcu_read_unlock();
1123
1124        no_module = !dev;
1125        if (no_module && capable(CAP_NET_ADMIN))
1126                no_module = request_module("netdev-%s", name);
1127        if (no_module && capable(CAP_SYS_MODULE)) {
1128                if (!request_module("%s", name))
1129                        pr_err("Loading kernel module for a network device "
1130"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1131"instead\n", name);
1132        }
1133}
1134EXPORT_SYMBOL(dev_load);
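
/*
 *      Example: a request such as dev_load(net, "foo0") (name illustrative)
 *      first tries request_module("netdev-foo0"), so a module that should be
 *      loaded on demand for that interface name needs to provide a matching
 *      "netdev-" module alias.
 */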
1135
1136static int __dev_open(struct net_device *dev)
1137{
1138        const struct net_device_ops *ops = dev->netdev_ops;
1139        int ret;
1140
1141        ASSERT_RTNL();
1142
1143        if (!netif_device_present(dev))
1144                return -ENODEV;
1145
1146        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147        ret = notifier_to_errno(ret);
1148        if (ret)
1149                return ret;
1150
1151        set_bit(__LINK_STATE_START, &dev->state);
1152
1153        if (ops->ndo_validate_addr)
1154                ret = ops->ndo_validate_addr(dev);
1155
1156        if (!ret && ops->ndo_open)
1157                ret = ops->ndo_open(dev);
1158
1159        if (ret)
1160                clear_bit(__LINK_STATE_START, &dev->state);
1161        else {
1162                dev->flags |= IFF_UP;
1163                net_dmaengine_get();
1164                dev_set_rx_mode(dev);
1165                dev_activate(dev);
1166        }
1167
1168        return ret;
1169}
1170
1171/**
1172 *      dev_open        - prepare an interface for use.
1173 *      @dev:   device to open
1174 *
1175 *      Takes a device from down to up state. The device's private open
1176 *      function is invoked and then the multicast lists are loaded. Finally
1177 *      the device is moved into the up state and a %NETDEV_UP message is
1178 *      sent to the netdev notifier chain.
1179 *
1180 *      Calling this function on an active interface is a nop. On a failure
1181 *      a negative errno code is returned.
1182 */
1183int dev_open(struct net_device *dev)
1184{
1185        int ret;
1186
1187        if (dev->flags & IFF_UP)
1188                return 0;
1189
1190        ret = __dev_open(dev);
1191        if (ret < 0)
1192                return ret;
1193
1194        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195        call_netdevice_notifiers(NETDEV_UP, dev);
1196
1197        return ret;
1198}
1199EXPORT_SYMBOL(dev_open);
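
/*
 *      Example sketch: bringing an interface up from code that does not
 *      already hold the RTNL lock:
 *
 *              rtnl_lock();
 *              err = dev_open(dev);
 *              rtnl_unlock();
 */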
1200
1201static int __dev_close_many(struct list_head *head)
1202{
1203        struct net_device *dev;
1204
1205        ASSERT_RTNL();
1206        might_sleep();
1207
1208        list_for_each_entry(dev, head, unreg_list) {
1209                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210
1211                clear_bit(__LINK_STATE_START, &dev->state);
1212
1213                /* Synchronize to scheduled poll. We cannot touch poll list, it
1214                 * can be even on different cpu. So just clear netif_running().
1215                 *
 1216                 * dev->stop() will invoke napi_disable() on all of its
1217                 * napi_struct instances on this device.
1218                 */
1219                smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220        }
1221
1222        dev_deactivate_many(head);
1223
1224        list_for_each_entry(dev, head, unreg_list) {
1225                const struct net_device_ops *ops = dev->netdev_ops;
1226
1227                /*
1228                 *      Call the device specific close. This cannot fail.
1229                 *      Only if device is UP
1230                 *
1231                 *      We allow it to be called even after a DETACH hot-plug
1232                 *      event.
1233                 */
1234                if (ops->ndo_stop)
1235                        ops->ndo_stop(dev);
1236
1237                dev->flags &= ~IFF_UP;
1238                net_dmaengine_put();
1239        }
1240
1241        return 0;
1242}
1243
1244static int __dev_close(struct net_device *dev)
1245{
1246        int retval;
1247        LIST_HEAD(single);
1248
1249        list_add(&dev->unreg_list, &single);
1250        retval = __dev_close_many(&single);
1251        list_del(&single);
1252        return retval;
1253}
1254
1255static int dev_close_many(struct list_head *head)
1256{
1257        struct net_device *dev, *tmp;
1258        LIST_HEAD(tmp_list);
1259
1260        list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261                if (!(dev->flags & IFF_UP))
1262                        list_move(&dev->unreg_list, &tmp_list);
1263
1264        __dev_close_many(head);
1265
1266        list_for_each_entry(dev, head, unreg_list) {
1267                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268                call_netdevice_notifiers(NETDEV_DOWN, dev);
1269        }
1270
1271        /* rollback_registered_many needs the complete original list */
1272        list_splice(&tmp_list, head);
1273        return 0;
1274}
1275
1276/**
1277 *      dev_close - shutdown an interface.
1278 *      @dev: device to shutdown
1279 *
1280 *      This function moves an active device into down state. A
1281 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283 *      chain.
1284 */
1285int dev_close(struct net_device *dev)
1286{
1287        if (dev->flags & IFF_UP) {
1288                LIST_HEAD(single);
1289
1290                list_add(&dev->unreg_list, &single);
1291                dev_close_many(&single);
1292                list_del(&single);
1293        }
1294        return 0;
1295}
1296EXPORT_SYMBOL(dev_close);
1297
1298
1299/**
1300 *      dev_disable_lro - disable Large Receive Offload on a device
1301 *      @dev: device
1302 *
1303 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1304 *      called under RTNL.  This is needed if received packets may be
1305 *      forwarded to another interface.
1306 */
1307void dev_disable_lro(struct net_device *dev)
1308{
1309        u32 flags;
1310
1311        if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1312                flags = dev->ethtool_ops->get_flags(dev);
1313        else
1314                flags = ethtool_op_get_flags(dev);
1315
1316        if (!(flags & ETH_FLAG_LRO))
1317                return;
1318
1319        __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1320        WARN_ON(dev->features & NETIF_F_LRO);
1321}
1322EXPORT_SYMBOL(dev_disable_lro);
1323
1324
1325static int dev_boot_phase = 1;
1326
1327/**
1328 *      register_netdevice_notifier - register a network notifier block
1329 *      @nb: notifier
1330 *
1331 *      Register a notifier to be called when network device events occur.
1332 *      The notifier passed is linked into the kernel structures and must
1333 *      not be reused until it has been unregistered. A negative errno code
1334 *      is returned on a failure.
1335 *
 1336 *      When registered, all registration and up events are replayed
 1337 *      to the new notifier to allow the device to have a race-free
 1338 *      view of the network device list.
1339 */
1340
1341int register_netdevice_notifier(struct notifier_block *nb)
1342{
1343        struct net_device *dev;
1344        struct net_device *last;
1345        struct net *net;
1346        int err;
1347
1348        rtnl_lock();
1349        err = raw_notifier_chain_register(&netdev_chain, nb);
1350        if (err)
1351                goto unlock;
1352        if (dev_boot_phase)
1353                goto unlock;
1354        for_each_net(net) {
1355                for_each_netdev(net, dev) {
1356                        err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1357                        err = notifier_to_errno(err);
1358                        if (err)
1359                                goto rollback;
1360
1361                        if (!(dev->flags & IFF_UP))
1362                                continue;
1363
1364                        nb->notifier_call(nb, NETDEV_UP, dev);
1365                }
1366        }
1367
1368unlock:
1369        rtnl_unlock();
1370        return err;
1371
1372rollback:
1373        last = dev;
1374        for_each_net(net) {
1375                for_each_netdev(net, dev) {
1376                        if (dev == last)
1377                                break;
1378
1379                        if (dev->flags & IFF_UP) {
1380                                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1381                                nb->notifier_call(nb, NETDEV_DOWN, dev);
1382                        }
1383                        nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1384                        nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1385                }
1386        }
1387
1388        raw_notifier_chain_unregister(&netdev_chain, nb);
1389        goto unlock;
1390}
1391EXPORT_SYMBOL(register_netdevice_notifier);
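
/*
 *      Example: a minimal sketch of a subsystem watching link-up events
 *      (handler and variable names are illustrative):
 *
 *              static int example_netdev_event(struct notifier_block *nb,
 *                                              unsigned long event, void *ptr)
 *              {
 *                      struct net_device *dev = ptr;
 *
 *                      if (event == NETDEV_UP)
 *                              printk(KERN_INFO "%s is up\n", dev->name);
 *                      return NOTIFY_DONE;
 *              }
 *
 *              static struct notifier_block example_netdev_notifier = {
 *                      .notifier_call = example_netdev_event,
 *              };
 *
 *              register_netdevice_notifier(&example_netdev_notifier);
 */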
1392
1393/**
1394 *      unregister_netdevice_notifier - unregister a network notifier block
1395 *      @nb: notifier
1396 *
1397 *      Unregister a notifier previously registered by
 1398 *      register_netdevice_notifier(). The notifier is unlinked from the
1399 *      kernel structures and may then be reused. A negative errno code
1400 *      is returned on a failure.
1401 */
1402
1403int unregister_netdevice_notifier(struct notifier_block *nb)
1404{
1405        int err;
1406
1407        rtnl_lock();
1408        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1409        rtnl_unlock();
1410        return err;
1411}
1412EXPORT_SYMBOL(unregister_netdevice_notifier);
1413
1414/**
1415 *      call_netdevice_notifiers - call all network notifier blocks
1416 *      @val: value passed unmodified to notifier function
1417 *      @dev: net_device pointer passed unmodified to notifier function
1418 *
1419 *      Call all network notifier blocks.  Parameters and return value
1420 *      are as for raw_notifier_call_chain().
1421 */
1422
1423int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1424{
1425        ASSERT_RTNL();
1426        return raw_notifier_call_chain(&netdev_chain, val, dev);
1427}
1428EXPORT_SYMBOL(call_netdevice_notifiers);
1429
1430/* When > 0 there are consumers of rx skb time stamps */
1431static atomic_t netstamp_needed = ATOMIC_INIT(0);
1432
1433void net_enable_timestamp(void)
1434{
1435        atomic_inc(&netstamp_needed);
1436}
1437EXPORT_SYMBOL(net_enable_timestamp);
1438
1439void net_disable_timestamp(void)
1440{
1441        atomic_dec(&netstamp_needed);
1442}
1443EXPORT_SYMBOL(net_disable_timestamp);
1444
1445static inline void net_timestamp_set(struct sk_buff *skb)
1446{
1447        if (atomic_read(&netstamp_needed))
1448                __net_timestamp(skb);
1449        else
1450                skb->tstamp.tv64 = 0;
1451}
1452
1453static inline void net_timestamp_check(struct sk_buff *skb)
1454{
1455        if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1456                __net_timestamp(skb);
1457}
1458
1459static inline bool is_skb_forwardable(struct net_device *dev,
1460                                      struct sk_buff *skb)
1461{
1462        unsigned int len;
1463
1464        if (!(dev->flags & IFF_UP))
1465                return false;
1466
1467        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1468        if (skb->len <= len)
1469                return true;
1470
1471        /* if TSO is enabled, we don't care about the length as the packet
 1472         * could be forwarded without being segmented beforehand
1473         */
1474        if (skb_is_gso(skb))
1475                return true;
1476
1477        return false;
1478}
1479
1480/**
1481 * dev_forward_skb - loopback an skb to another netif
1482 *
1483 * @dev: destination network device
1484 * @skb: buffer to forward
1485 *
1486 * return values:
1487 *      NET_RX_SUCCESS  (no congestion)
1488 *      NET_RX_DROP     (packet was dropped, but freed)
1489 *
1490 * dev_forward_skb can be used for injecting an skb from the
1491 * start_xmit function of one device into the receive queue
1492 * of another device.
1493 *
1494 * The receiving device may be in another namespace, so
1495 * we have to clear all information in the skb that could
1496 * impact namespace isolation.
1497 */
1498int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1499{
1500        skb_orphan(skb);
1501        nf_reset(skb);
1502
1503        if (unlikely(!is_skb_forwardable(dev, skb))) {
1504                atomic_long_inc(&dev->rx_dropped);
1505                kfree_skb(skb);
1506                return NET_RX_DROP;
1507        }
1508        skb_set_dev(skb, dev);
1509        skb->tstamp.tv64 = 0;
1510        skb->pkt_type = PACKET_HOST;
1511        skb->protocol = eth_type_trans(skb, dev);
1512        return netif_rx(skb);
1513}
1514EXPORT_SYMBOL_GPL(dev_forward_skb);
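
/*
 *      Example sketch: a veth-like driver injecting a transmitted skb into
 *      its peer device from ndo_start_xmit (peer lookup omitted):
 *
 *              static netdev_tx_t example_xmit(struct sk_buff *skb,
 *                                              struct net_device *dev)
 *              {
 *                      struct net_device *peer = ...;
 *
 *                      dev_forward_skb(peer, skb);
 *                      return NETDEV_TX_OK;
 *              }
 */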
1515
1516static inline int deliver_skb(struct sk_buff *skb,
1517                              struct packet_type *pt_prev,
1518                              struct net_device *orig_dev)
1519{
1520        atomic_inc(&skb->users);
1521        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1522}
1523
1524/*
1525 *      Support routine. Sends outgoing frames to any network
1526 *      taps currently in use.
1527 */
1528
1529static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1530{
1531        struct packet_type *ptype;
1532        struct sk_buff *skb2 = NULL;
1533        struct packet_type *pt_prev = NULL;
1534
1535        rcu_read_lock();
1536        list_for_each_entry_rcu(ptype, &ptype_all, list) {
1537                /* Never send packets back to the socket
1538                 * they originated from - MvS (miquels@drinkel.ow.org)
1539                 */
1540                if ((ptype->dev == dev || !ptype->dev) &&
1541                    (ptype->af_packet_priv == NULL ||
1542                     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1543                        if (pt_prev) {
1544                                deliver_skb(skb2, pt_prev, skb->dev);
1545                                pt_prev = ptype;
1546                                continue;
1547                        }
1548
1549                        skb2 = skb_clone(skb, GFP_ATOMIC);
1550                        if (!skb2)
1551                                break;
1552
1553                        net_timestamp_set(skb2);
1554
1555                        /* skb->nh should be correctly set by the
1556                         * sender, so that the second statement is
1557                         * just protection against buggy protocols.
1558                         */
1559                        skb_reset_mac_header(skb2);
1560
1561                        if (skb_network_header(skb2) < skb2->data ||
1562                            skb2->network_header > skb2->tail) {
1563                                if (net_ratelimit())
1564                                        printk(KERN_CRIT "protocol %04x is "
1565                                               "buggy, dev %s\n",
1566                                               ntohs(skb2->protocol),
1567                                               dev->name);
1568                                skb_reset_network_header(skb2);
1569                        }
1570
1571                        skb2->transport_header = skb2->network_header;
1572                        skb2->pkt_type = PACKET_OUTGOING;
1573                        pt_prev = ptype;
1574                }
1575        }
1576        if (pt_prev)
1577                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1578        rcu_read_unlock();
1579}
1580
1581/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1582 * @dev: Network device
1583 * @txq: number of queues available
1584 *
1585 * If real_num_tx_queues is changed the tc mappings may no longer be
1586 * valid. To resolve this verify that each tc mapping remains valid,
1587 * and if it does not, null the mapping. With no priorities mapping to
1588 * the offset/count pair it will no longer be used. In the worst case,
1589 * TC0 itself is invalid and nothing can be done, so priority mappings
1590 * are disabled entirely. It is expected that drivers will fix up the
1591 * mapping, if they can, before calling netif_set_real_num_tx_queues.
1592 */
1593static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1594{
1595        int i;
1596        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1597
1598        /* If TC0 is invalidated disable TC mapping */
1599        if (tc->offset + tc->count > txq) {
1600                pr_warning("Number of in use tx queues changed, "
1601                           "invalidating tc mappings. Priority "
1602                           "traffic classification disabled!\n");
1603                dev->num_tc = 0;
1604                return;
1605        }
1606
1607        /* Invalidated prio to tc mappings set to TC0 */
1608        for (i = 1; i < TC_BITMASK + 1; i++) {
1609                int q = netdev_get_prio_tc_map(dev, i);
1610
1611                tc = &dev->tc_to_txq[q];
1612                if (tc->offset + tc->count > txq) {
1613                        pr_warning("Number of in use tx queues "
1614                                   "changed. Priority %i to tc "
1615                                   "mapping %i is no longer valid; "
1616                                   "setting map to 0\n",
1617                                   i, q);
1618                        netdev_set_prio_tc_map(dev, i, 0);
1619                }
1620        }
1621}
1622
1623/*
1624 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1625 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1626 */
1627int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1628{
1629        int rc;
1630
1631        if (txq < 1 || txq > dev->num_tx_queues)
1632                return -EINVAL;
1633
1634        if (dev->reg_state == NETREG_REGISTERED ||
1635            dev->reg_state == NETREG_UNREGISTERING) {
1636                ASSERT_RTNL();
1637
1638                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1639                                                  txq);
1640                if (rc)
1641                        return rc;
1642
1643                if (dev->num_tc)
1644                        netif_setup_tc(dev, txq);
1645
1646                if (txq < dev->real_num_tx_queues)
1647                        qdisc_reset_all_tx_gt(dev, txq);
1648        }
1649
1650        dev->real_num_tx_queues = txq;
1651        return 0;
1652}
1653EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1654
1655#ifdef CONFIG_RPS
1656/**
1657 *      netif_set_real_num_rx_queues - set actual number of RX queues used
1658 *      @dev: Network device
1659 *      @rxq: Actual number of RX queues
1660 *
1661 *      This must be called either with the rtnl_lock held or before
1662 *      registration of the net device.  Returns 0 on success, or a
1663 *      negative error code.  If called before registration, it always
1664 *      succeeds.
1665 */
1666int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1667{
1668        int rc;
1669
1670        if (rxq < 1 || rxq > dev->num_rx_queues)
1671                return -EINVAL;
1672
1673        if (dev->reg_state == NETREG_REGISTERED) {
1674                ASSERT_RTNL();
1675
1676                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1677                                                  rxq);
1678                if (rc)
1679                        return rc;
1680        }
1681
1682        dev->real_num_rx_queues = rxq;
1683        return 0;
1684}
1685EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1686#endif
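
/*
 * A minimal sketch of how a multiqueue driver might shrink its active
 * queue counts with the two helpers above, e.g. from a configuration
 * path that already runs under rtnl_lock; the foo_* naming is purely
 * illustrative:
 */
static int foo_set_queue_count(struct net_device *dev, unsigned int count)
{
        int err;

        ASSERT_RTNL();

        err = netif_set_real_num_tx_queues(dev, count);
        if (err)
                return err;

        return netif_set_real_num_rx_queues(dev, count);
}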
1687
1688static inline void __netif_reschedule(struct Qdisc *q)
1689{
1690        struct softnet_data *sd;
1691        unsigned long flags;
1692
1693        local_irq_save(flags);
1694        sd = &__get_cpu_var(softnet_data);
1695        q->next_sched = NULL;
1696        *sd->output_queue_tailp = q;
1697        sd->output_queue_tailp = &q->next_sched;
1698        raise_softirq_irqoff(NET_TX_SOFTIRQ);
1699        local_irq_restore(flags);
1700}
1701
1702void __netif_schedule(struct Qdisc *q)
1703{
1704        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1705                __netif_reschedule(q);
1706}
1707EXPORT_SYMBOL(__netif_schedule);
1708
1709void dev_kfree_skb_irq(struct sk_buff *skb)
1710{
1711        if (atomic_dec_and_test(&skb->users)) {
1712                struct softnet_data *sd;
1713                unsigned long flags;
1714
1715                local_irq_save(flags);
1716                sd = &__get_cpu_var(softnet_data);
1717                skb->next = sd->completion_queue;
1718                sd->completion_queue = skb;
1719                raise_softirq_irqoff(NET_TX_SOFTIRQ);
1720                local_irq_restore(flags);
1721        }
1722}
1723EXPORT_SYMBOL(dev_kfree_skb_irq);
1724
1725void dev_kfree_skb_any(struct sk_buff *skb)
1726{
1727        if (in_irq() || irqs_disabled())
1728                dev_kfree_skb_irq(skb);
1729        else
1730                dev_kfree_skb(skb);
1731}
1732EXPORT_SYMBOL(dev_kfree_skb_any);
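
/*
 * A minimal sketch of the intended use of the two helpers above:
 * freeing completed TX buffers from a context that may or may not be
 * hard IRQ (a TX completion handler, say).  The array-of-skbs layout
 * is purely hypothetical:
 */
static void foo_free_completed(struct sk_buff **done, unsigned int n)
{
        unsigned int i;

        for (i = 0; i < n; i++)
                dev_kfree_skb_any(done[i]);     /* safe in any context */
}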
1733
1734
1735/**
1736 * netif_device_detach - mark device as removed
1737 * @dev: network device
1738 *
1739 * Mark device as removed from the system and therefore no longer available.
1740 */
1741void netif_device_detach(struct net_device *dev)
1742{
1743        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1744            netif_running(dev)) {
1745                netif_tx_stop_all_queues(dev);
1746        }
1747}
1748EXPORT_SYMBOL(netif_device_detach);
1749
1750/**
1751 * netif_device_attach - mark device as attached
1752 * @dev: network device
1753 *
1754 * Mark device as attached to the system and restart it if needed.
1755 */
1756void netif_device_attach(struct net_device *dev)
1757{
1758        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1759            netif_running(dev)) {
1760                netif_tx_wake_all_queues(dev);
1761                __netdev_watchdog_up(dev);
1762        }
1763}
1764EXPORT_SYMBOL(netif_device_attach);
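
/*
 * A minimal sketch of the usual pairing of netif_device_detach() and
 * netif_device_attach() in a PCI driver's suspend/resume hooks; the
 * foo_* names are hypothetical and the actual hardware work is elided:
 */
static int foo_suspend(struct pci_dev *pdev, pm_message_t state)
{
        struct net_device *dev = pci_get_drvdata(pdev);

        netif_device_detach(dev);       /* stop TX queues, mark !present */
        /* ... stop the hardware and save its state here ... */
        return 0;
}

static int foo_resume(struct pci_dev *pdev)
{
        struct net_device *dev = pci_get_drvdata(pdev);

        /* ... restore state and restart the hardware here ... */
        netif_device_attach(dev);       /* wake TX queues if running */
        return 0;
}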
1765
1766/**
1767 * skb_set_dev - assign a new device to a buffer
1768 * @skb: buffer for the new device
1769 * @dev: network device
1770 *
1771 * If an skb is owned by a device already, we have to reset
1772 * all data private to the namespace a device belongs to
1773 * before assigning it a new device.
1774 */
1775#ifdef CONFIG_NET_NS
1776void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1777{
1778        skb_dst_drop(skb);
1779        if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1780                secpath_reset(skb);
1781                nf_reset(skb);
1782                skb_init_secmark(skb);
1783                skb->mark = 0;
1784                skb->priority = 0;
1785                skb->nf_trace = 0;
1786                skb->ipvs_property = 0;
1787#ifdef CONFIG_NET_SCHED
1788                skb->tc_index = 0;
1789#endif
1790        }
1791        skb->dev = dev;
1792}
1793EXPORT_SYMBOL(skb_set_dev);
1794#endif /* CONFIG_NET_NS */
1795
1796/*
1797 * Invalidate hardware checksum when packet is to be mangled, and
1798 * complete checksum manually on outgoing path.
1799 */
1800int skb_checksum_help(struct sk_buff *skb)
1801{
1802        __wsum csum;
1803        int ret = 0, offset;
1804
1805        if (skb->ip_summed == CHECKSUM_COMPLETE)
1806                goto out_set_summed;
1807
1808        if (unlikely(skb_shinfo(skb)->gso_size)) {
1809                /* Let GSO fix up the checksum. */
1810                goto out_set_summed;
1811        }
1812
1813        offset = skb_checksum_start_offset(skb);
1814        BUG_ON(offset >= skb_headlen(skb));
1815        csum = skb_checksum(skb, offset, skb->len - offset, 0);
1816
1817        offset += skb->csum_offset;
1818        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1819
1820        if (skb_cloned(skb) &&
1821            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1822                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1823                if (ret)
1824                        goto out;
1825        }
1826
1827        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1828out_set_summed:
1829        skb->ip_summed = CHECKSUM_NONE;
1830out:
1831        return ret;
1832}
1833EXPORT_SYMBOL(skb_checksum_help);
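
/*
 * A minimal sketch of the usual driver-side fallback around
 * skb_checksum_help(), assuming a hypothetical NIC that can only
 * checksum IPv4 packets: any other CHECKSUM_PARTIAL skb is completed
 * in software before it is handed to the hardware.
 */
static int foo_tx_csum(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            skb->protocol != htons(ETH_P_IP))
                return skb_checksum_help(skb);
        return 0;
}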
1834
1835/**
1836 *      skb_gso_segment - Perform segmentation on skb.
1837 *      @skb: buffer to segment
1838 *      @features: features for the output path (see dev->features)
1839 *
1840 *      This function segments the given skb and returns a list of segments.
1841 *
1842 *      It may return NULL if the skb requires no segmentation.  This is
1843 *      only possible when GSO is used for verifying header integrity.
1844 */
1845struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1846{
1847        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1848        struct packet_type *ptype;
1849        __be16 type = skb->protocol;
1850        int vlan_depth = ETH_HLEN;
1851        int err;
1852
1853        while (type == htons(ETH_P_8021Q)) {
1854                struct vlan_hdr *vh;
1855
1856                if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1857                        return ERR_PTR(-EINVAL);
1858
1859                vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1860                type = vh->h_vlan_encapsulated_proto;
1861                vlan_depth += VLAN_HLEN;
1862        }
1863
1864        skb_reset_mac_header(skb);
1865        skb->mac_len = skb->network_header - skb->mac_header;
1866        __skb_pull(skb, skb->mac_len);
1867
1868        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1869                struct net_device *dev = skb->dev;
1870                struct ethtool_drvinfo info = {};
1871
1872                if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1873                        dev->ethtool_ops->get_drvinfo(dev, &info);
1874
1875                WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1876                     info.driver, dev ? dev->features : 0L,
1877                     skb->sk ? skb->sk->sk_route_caps : 0L,
1878                     skb->len, skb->data_len, skb->ip_summed);
1879
1880                if (skb_header_cloned(skb) &&
1881                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1882                        return ERR_PTR(err);
1883        }
1884
1885        rcu_read_lock();
1886        list_for_each_entry_rcu(ptype,
1887                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1888                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1889                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1890                                err = ptype->gso_send_check(skb);
1891                                segs = ERR_PTR(err);
1892                                if (err || skb_gso_ok(skb, features))
1893                                        break;
1894                                __skb_push(skb, (skb->data -
1895                                                 skb_network_header(skb)));
1896                        }
1897                        segs = ptype->gso_segment(skb, features);
1898                        break;
1899                }
1900        }
1901        rcu_read_unlock();
1902
1903        __skb_push(skb, skb->data - skb_mac_header(skb));
1904
1905        return segs;
1906}
1907EXPORT_SYMBOL(skb_gso_segment);
1908
1909/* Take action when hardware reception checksum errors are detected. */
1910#ifdef CONFIG_BUG
1911void netdev_rx_csum_fault(struct net_device *dev)
1912{
1913        if (net_ratelimit()) {
1914                printk(KERN_ERR "%s: hw csum failure.\n",
1915                        dev ? dev->name : "<unknown>");
1916                dump_stack();
1917        }
1918}
1919EXPORT_SYMBOL(netdev_rx_csum_fault);
1920#endif
1921
1922/* Actually, we should eliminate this check as soon as we know that:
1923 * 1. An IOMMU is present and can map all of memory.
1924 * 2. No high memory really exists on this machine.
1925 */
1926
1927static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1928{
1929#ifdef CONFIG_HIGHMEM
1930        int i;
1931        if (!(dev->features & NETIF_F_HIGHDMA)) {
1932                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1933                        if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1934                                return 1;
1935        }
1936
1937        if (PCI_DMA_BUS_IS_PHYS) {
1938                struct device *pdev = dev->dev.parent;
1939
1940                if (!pdev)
1941                        return 0;
1942                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1943                        dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1944                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1945                                return 1;
1946                }
1947        }
1948#endif
1949        return 0;
1950}
1951
1952struct dev_gso_cb {
1953        void (*destructor)(struct sk_buff *skb);
1954};
1955
1956#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1957
1958static void dev_gso_skb_destructor(struct sk_buff *skb)
1959{
1960        struct dev_gso_cb *cb;
1961
1962        do {
1963                struct sk_buff *nskb = skb->next;
1964
1965                skb->next = nskb->next;
1966                nskb->next = NULL;
1967                kfree_skb(nskb);
1968        } while (skb->next);
1969
1970        cb = DEV_GSO_CB(skb);
1971        if (cb->destructor)
1972                cb->destructor(skb);
1973}
1974
1975/**
1976 *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1977 *      @skb: buffer to segment
1978 *      @features: device features as applicable to this skb
1979 *
1980 *      This function segments the given skb and stores the list of segments
1981 *      in skb->next.
1982 */
1983static int dev_gso_segment(struct sk_buff *skb, int features)
1984{
1985        struct sk_buff *segs;
1986
1987        segs = skb_gso_segment(skb, features);
1988
1989        /* Verifying header integrity only. */
1990        if (!segs)
1991                return 0;
1992
1993        if (IS_ERR(segs))
1994                return PTR_ERR(segs);
1995
1996        skb->next = segs;
1997        DEV_GSO_CB(skb)->destructor = skb->destructor;
1998        skb->destructor = dev_gso_skb_destructor;
1999
2000        return 0;
2001}
2002
2003/*
2004 * Try to orphan the skb early, right before transmission by the device.
2005 * We cannot orphan the skb if a tx timestamp is requested or the sk reference
2006 * is needed at the driver level for other reasons, e.g. see net/can/raw.c
2007 */
2008static inline void skb_orphan_try(struct sk_buff *skb)
2009{
2010        struct sock *sk = skb->sk;
2011
2012        if (sk && !skb_shinfo(skb)->tx_flags) {
2013                /* skb_tx_hash() won't be able to get the sk,
2014                 * so we copy sk_hash into skb->rxhash.
2015                 */
2016                if (!skb->rxhash)
2017                        skb->rxhash = sk->sk_hash;
2018                skb_orphan(skb);
2019        }
2020}
2021
2022static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2023{
2024        return ((features & NETIF_F_GEN_CSUM) ||
2025                ((features & NETIF_F_V4_CSUM) &&
2026                 protocol == htons(ETH_P_IP)) ||
2027                ((features & NETIF_F_V6_CSUM) &&
2028                 protocol == htons(ETH_P_IPV6)) ||
2029                ((features & NETIF_F_FCOE_CRC) &&
2030                 protocol == htons(ETH_P_FCOE)));
2031}
2032
2033static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2034{
2035        if (!can_checksum_protocol(features, protocol)) {
2036                features &= ~NETIF_F_ALL_CSUM;
2037                features &= ~NETIF_F_SG;
2038        } else if (illegal_highdma(skb->dev, skb)) {
2039                features &= ~NETIF_F_SG;
2040        }
2041
2042        return features;
2043}
2044
2045u32 netif_skb_features(struct sk_buff *skb)
2046{
2047        __be16 protocol = skb->protocol;
2048        u32 features = skb->dev->features;
2049
2050        if (protocol == htons(ETH_P_8021Q)) {
2051                struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2052                protocol = veh->h_vlan_encapsulated_proto;
2053        } else if (!vlan_tx_tag_present(skb)) {
2054                return harmonize_features(skb, protocol, features);
2055        }
2056
2057        features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2058
2059        if (protocol != htons(ETH_P_8021Q)) {
2060                return harmonize_features(skb, protocol, features);
2061        } else {
2062                features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2063                                NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2064                return harmonize_features(skb, protocol, features);
2065        }
2066}
2067EXPORT_SYMBOL(netif_skb_features);
2068
2069/*
2070 * Returns true if either:
2071 *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2072 *      2. skb is fragmented and the device does not support SG, or if
2073 *         at least one of the fragments is in highmem and the device
2074 *         does not support DMA from it.
2075 */
2076static inline int skb_needs_linearize(struct sk_buff *skb,
2077                                      int features)
2078{
2079        return skb_is_nonlinear(skb) &&
2080                        ((skb_has_frag_list(skb) &&
2081                                !(features & NETIF_F_FRAGLIST)) ||
2082                        (skb_shinfo(skb)->nr_frags &&
2083                                !(features & NETIF_F_SG)));
2084}
2085
2086int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2087                        struct netdev_queue *txq)
2088{
2089        const struct net_device_ops *ops = dev->netdev_ops;
2090        int rc = NETDEV_TX_OK;
2091
2092        if (likely(!skb->next)) {
2093                u32 features;
2094
2095                /*
2096                 * If the device doesn't need skb->dst, release it right
2097                 * now while it's hot in this CPU's cache.
2098                 */
2099                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2100                        skb_dst_drop(skb);
2101
2102                if (!list_empty(&ptype_all))
2103                        dev_queue_xmit_nit(skb, dev);
2104
2105                skb_orphan_try(skb);
2106
2107                features = netif_skb_features(skb);
2108
2109                if (vlan_tx_tag_present(skb) &&
2110                    !(features & NETIF_F_HW_VLAN_TX)) {
2111                        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2112                        if (unlikely(!skb))
2113                                goto out;
2114
2115                        skb->vlan_tci = 0;
2116                }
2117
2118                if (netif_needs_gso(skb, features)) {
2119                        if (unlikely(dev_gso_segment(skb, features)))
2120                                goto out_kfree_skb;
2121                        if (skb->next)
2122                                goto gso;
2123                } else {
2124                        if (skb_needs_linearize(skb, features) &&
2125                            __skb_linearize(skb))
2126                                goto out_kfree_skb;
2127
2128                        /* If packet is not checksummed and device does not
2129                         * support checksumming for this protocol, complete
2130                         * checksumming here.
2131                         */
2132                        if (skb->ip_summed == CHECKSUM_PARTIAL) {
2133                                skb_set_transport_header(skb,
2134                                        skb_checksum_start_offset(skb));
2135                                if (!(features & NETIF_F_ALL_CSUM) &&
2136                                     skb_checksum_help(skb))
2137                                        goto out_kfree_skb;
2138                        }
2139                }
2140
2141                rc = ops->ndo_start_xmit(skb, dev);
2142                trace_net_dev_xmit(skb, rc);
2143                if (rc == NETDEV_TX_OK)
2144                        txq_trans_update(txq);
2145                return rc;
2146        }
2147
2148gso:
2149        do {
2150                struct sk_buff *nskb = skb->next;
2151
2152                skb->next = nskb->next;
2153                nskb->next = NULL;
2154
2155                /*
2156                 * If the device doesn't need nskb->dst, release it right
2157                 * now while it's hot in this CPU's cache.
2158                 */
2159                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2160                        skb_dst_drop(nskb);
2161
2162                rc = ops->ndo_start_xmit(nskb, dev);
2163                trace_net_dev_xmit(nskb, rc);
2164                if (unlikely(rc != NETDEV_TX_OK)) {
2165                        if (rc & ~NETDEV_TX_MASK)
2166                                goto out_kfree_gso_skb;
2167                        nskb->next = skb->next;
2168                        skb->next = nskb;
2169                        return rc;
2170                }
2171                txq_trans_update(txq);
2172                if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2173                        return NETDEV_TX_BUSY;
2174        } while (skb->next);
2175
2176out_kfree_gso_skb:
2177        if (likely(skb->next == NULL))
2178                skb->destructor = DEV_GSO_CB(skb)->destructor;
2179out_kfree_skb:
2180        kfree_skb(skb);
2181out:
2182        return rc;
2183}
2184
2185static u32 hashrnd __read_mostly;
2186
2187/*
2188 * Returns a Tx hash based on the given packet descriptor and the number of
2189 * Tx queues to be used as a distribution range.
2190 */
2191u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2192                  unsigned int num_tx_queues)
2193{
2194        u32 hash;
2195        u16 qoffset = 0;
2196        u16 qcount = num_tx_queues;
2197
2198        if (skb_rx_queue_recorded(skb)) {
2199                hash = skb_get_rx_queue(skb);
2200                while (unlikely(hash >= num_tx_queues))
2201                        hash -= num_tx_queues;
2202                return hash;
2203        }
2204
2205        if (dev->num_tc) {
2206                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2207                qoffset = dev->tc_to_txq[tc].offset;
2208                qcount = dev->tc_to_txq[tc].count;
2209        }
2210
2211        if (skb->sk && skb->sk->sk_hash)
2212                hash = skb->sk->sk_hash;
2213        else
2214                hash = (__force u16) skb->protocol ^ skb->rxhash;
2215        hash = jhash_1word(hash, hashrnd);
2216
2217        return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2218}
2219EXPORT_SYMBOL(__skb_tx_hash);
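
/*
 * Note on the scaling in __skb_tx_hash() above: "((u64) hash * qcount) >> 32"
 * maps a 32-bit hash uniformly onto [0, qcount) without a division, since it
 * computes floor(hash * qcount / 2^32).  For example, with qcount = 8 and
 * hash = 0x80000000 the result is (0x80000000ULL * 8) >> 32 = 4, i.e. the
 * middle of the range, which is then shifted by qoffset.
 */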
2220
2221static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2222{
2223        if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2224                if (net_ratelimit()) {
2225                        pr_warning("%s selects TX queue %d, but "
2226                                "real number of TX queues is %d\n",
2227                                dev->name, queue_index, dev->real_num_tx_queues);
2228                }
2229                return 0;
2230        }
2231        return queue_index;
2232}
2233
2234static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2235{
2236#ifdef CONFIG_XPS
2237        struct xps_dev_maps *dev_maps;
2238        struct xps_map *map;
2239        int queue_index = -1;
2240
2241        rcu_read_lock();
2242        dev_maps = rcu_dereference(dev->xps_maps);
2243        if (dev_maps) {
2244                map = rcu_dereference(
2245                    dev_maps->cpu_map[raw_smp_processor_id()]);
2246                if (map) {
2247                        if (map->len == 1)
2248                                queue_index = map->queues[0];
2249                        else {
2250                                u32 hash;
2251                                if (skb->sk && skb->sk->sk_hash)
2252                                        hash = skb->sk->sk_hash;
2253                                else
2254                                        hash = (__force u16) skb->protocol ^
2255                                            skb->rxhash;
2256                                hash = jhash_1word(hash, hashrnd);
2257                                queue_index = map->queues[
2258                                    ((u64)hash * map->len) >> 32];
2259                        }
2260                        if (unlikely(queue_index >= dev->real_num_tx_queues))
2261                                queue_index = -1;
2262                }
2263        }
2264        rcu_read_unlock();
2265
2266        return queue_index;
2267#else
2268        return -1;
2269#endif
2270}
2271
2272static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2273                                        struct sk_buff *skb)
2274{
2275        int queue_index;
2276        const struct net_device_ops *ops = dev->netdev_ops;
2277
2278        if (dev->real_num_tx_queues == 1)
2279                queue_index = 0;
2280        else if (ops->ndo_select_queue) {
2281                queue_index = ops->ndo_select_queue(dev, skb);
2282                queue_index = dev_cap_txqueue(dev, queue_index);
2283        } else {
2284                struct sock *sk = skb->sk;
2285                queue_index = sk_tx_queue_get(sk);
2286
2287                if (queue_index < 0 || skb->ooo_okay ||
2288                    queue_index >= dev->real_num_tx_queues) {
2289                        int old_index = queue_index;
2290
2291                        queue_index = get_xps_queue(dev, skb);
2292                        if (queue_index < 0)
2293                                queue_index = skb_tx_hash(dev, skb);
2294
2295                        if (queue_index != old_index && sk) {
2296                                struct dst_entry *dst =
2297                                    rcu_dereference_check(sk->sk_dst_cache, 1);
2298
2299                                if (dst && skb_dst(skb) == dst)
2300                                        sk_tx_queue_set(sk, queue_index);
2301                        }
2302                }
2303        }
2304
2305        skb_set_queue_mapping(skb, queue_index);
2306        return netdev_get_tx_queue(dev, queue_index);
2307}
2308
2309static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2310                                 struct net_device *dev,
2311                                 struct netdev_queue *txq)
2312{
2313        spinlock_t *root_lock = qdisc_lock(q);
2314        bool contended;
2315        int rc;
2316
2317        qdisc_skb_cb(skb)->pkt_len = skb->len;
2318        qdisc_calculate_pkt_len(skb, q);
2319        /*
2320         * Heuristic to force contended enqueues to serialize on a
2321         * separate lock before trying to get the qdisc main lock.
2322         * This permits the __QDISC_STATE_RUNNING owner to get the lock
2323         * more often and dequeue packets faster.
2324         */
2325        contended = qdisc_is_running(q);
2326        if (unlikely(contended))
2327                spin_lock(&q->busylock);
2328
2329        spin_lock(root_lock);
2330        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2331                kfree_skb(skb);
2332                rc = NET_XMIT_DROP;
2333        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2334                   qdisc_run_begin(q)) {
2335                /*
2336                 * This is a work-conserving queue; there are no old skbs
2337                 * waiting to be sent out; and the qdisc is not running -
2338                 * xmit the skb directly.
2339                 */
2340                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2341                        skb_dst_force(skb);
2342
2343                qdisc_bstats_update(q, skb);
2344
2345                if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2346                        if (unlikely(contended)) {
2347                                spin_unlock(&q->busylock);
2348                                contended = false;
2349                        }
2350                        __qdisc_run(q);
2351                } else
2352                        qdisc_run_end(q);
2353
2354                rc = NET_XMIT_SUCCESS;
2355        } else {
2356                skb_dst_force(skb);
2357                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2358                if (qdisc_run_begin(q)) {
2359                        if (unlikely(contended)) {
2360                                spin_unlock(&q->busylock);
2361                                contended = false;
2362                        }
2363                        __qdisc_run(q);
2364                }
2365        }
2366        spin_unlock(root_lock);
2367        if (unlikely(contended))
2368                spin_unlock(&q->busylock);
2369        return rc;
2370}
2371
2372static DEFINE_PER_CPU(int, xmit_recursion);
2373#define RECURSION_LIMIT 10
2374
2375/**
2376 *      dev_queue_xmit - transmit a buffer
2377 *      @skb: buffer to transmit
2378 *
2379 *      Queue a buffer for transmission to a network device. The caller must
2380 *      have set the device and priority and built the buffer before calling
2381 *      this function. The function can be called from an interrupt.
2382 *
2383 *      A negative errno code is returned on a failure. A success does not
2384 *      guarantee the frame will be transmitted as it may be dropped due
2385 *      to congestion or traffic shaping.
2386 *
2387 * -----------------------------------------------------------------------------------
2388 *      I notice this method can also return errors from the queue disciplines,
2389 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2390 *      be positive.
2391 *
2392 *      Regardless of the return value, the skb is consumed, so it is currently
2393 *      difficult to retry a send to this method.  (You can bump the ref count
2394 *      before sending to hold a reference for retry if you are careful.)
2395 *
2396 *      When calling this method, interrupts MUST be enabled.  This is because
2397 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2398 *          --BLG
2399 */
2400int dev_queue_xmit(struct sk_buff *skb)
2401{
2402        struct net_device *dev = skb->dev;
2403        struct netdev_queue *txq;
2404        struct Qdisc *q;
2405        int rc = -ENOMEM;
2406
2407        /* Disable soft irqs for various locks below. Also
2408         * stops preemption for RCU.
2409         */
2410        rcu_read_lock_bh();
2411
2412        txq = dev_pick_tx(dev, skb);
2413        q = rcu_dereference_bh(txq->qdisc);
2414
2415#ifdef CONFIG_NET_CLS_ACT
2416        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2417#endif
2418        trace_net_dev_queue(skb);
2419        if (q->enqueue) {
2420                rc = __dev_xmit_skb(skb, q, dev, txq);
2421                goto out;
2422        }
2423
2424        /* The device has no queue. Common case for software devices:
2425           loopback, all sorts of tunnels...
2426
2427           Really, it is unlikely that netif_tx_lock protection is necessary
2428           here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2429           counters.)
2430           However, it is possible that they rely on the protection
2431           we provide here.
2432
2433           Check this and take the lock. It is not prone to deadlocks.
2434           Or just shoot the noqueue qdisc - that is even simpler 8)
2435         */
2436        if (dev->flags & IFF_UP) {
2437                int cpu = smp_processor_id(); /* ok because BHs are off */
2438
2439                if (txq->xmit_lock_owner != cpu) {
2440
2441                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2442                                goto recursion_alert;
2443
2444                        HARD_TX_LOCK(dev, txq, cpu);
2445
2446                        if (!netif_tx_queue_stopped(txq)) {
2447                                __this_cpu_inc(xmit_recursion);
2448                                rc = dev_hard_start_xmit(skb, dev, txq);
2449                                __this_cpu_dec(xmit_recursion);
2450                                if (dev_xmit_complete(rc)) {
2451                                        HARD_TX_UNLOCK(dev, txq);
2452                                        goto out;
2453                                }
2454                        }
2455                        HARD_TX_UNLOCK(dev, txq);
2456                        if (net_ratelimit())
2457                                printk(KERN_CRIT "Virtual device %s asks to "
2458                                       "queue packet!\n", dev->name);
2459                } else {
2460                        /* Recursion is detected! It is possible,
2461                         * unfortunately
2462                         */
2463recursion_alert:
2464                        if (net_ratelimit())
2465                                printk(KERN_CRIT "Dead loop on virtual device "
2466                                       "%s, fix it urgently!\n", dev->name);
2467                }
2468        }
2469
2470        rc = -ENETDOWN;
2471        rcu_read_unlock_bh();
2472
2473        kfree_skb(skb);
2474        return rc;
2475out:
2476        rcu_read_unlock_bh();
2477        return rc;
2478}
2479EXPORT_SYMBOL(dev_queue_xmit);
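
/*
 * A minimal sketch of a dev_queue_xmit() caller, assuming "buf" already
 * holds a complete link-layer frame for the given device and protocol;
 * the foo_* name is illustrative.  Note that the skb is consumed
 * whatever the return value is, so there is no retry here:
 */
static int foo_send_frame(struct net_device *dev, const void *buf,
                          unsigned int len, __be16 proto)
{
        struct sk_buff *skb;

        skb = alloc_skb(len, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        memcpy(skb_put(skb, len), buf, len);
        skb_reset_mac_header(skb);

        skb->dev = dev;
        skb->protocol = proto;
        skb->priority = 0;

        return dev_queue_xmit(skb);
}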
2480
2481
2482/*=======================================================================
2483                        Receiver routines
2484  =======================================================================*/
2485
2486int netdev_max_backlog __read_mostly = 1000;
2487int netdev_tstamp_prequeue __read_mostly = 1;
2488int netdev_budget __read_mostly = 300;
2489int weight_p __read_mostly = 64;            /* old backlog weight */
2490
2491/* Called with irq disabled */
2492static inline void ____napi_schedule(struct softnet_data *sd,
2493                                     struct napi_struct *napi)
2494{
2495        list_add_tail(&napi->poll_list, &sd->poll_list);
2496        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2497}
2498
2499/*
2500 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2501 * and src/dst port numbers. Returns a non-zero hash number on success
2502 * and 0 on failure.
2503 */
2504__u32 __skb_get_rxhash(struct sk_buff *skb)
2505{
2506        int nhoff, hash = 0, poff;
2507        struct ipv6hdr *ip6;
2508        struct iphdr *ip;
2509        u8 ip_proto;
2510        u32 addr1, addr2, ihl;
2511        union {
2512                u32 v32;
2513                u16 v16[2];
2514        } ports;
2515
2516        nhoff = skb_network_offset(skb);
2517
2518        switch (skb->protocol) {
2519        case __constant_htons(ETH_P_IP):
2520                if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2521                        goto done;
2522
2523                ip = (struct iphdr *) (skb->data + nhoff);
2524                if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2525                        ip_proto = 0;
2526                else
2527                        ip_proto = ip->protocol;
2528                addr1 = (__force u32) ip->saddr;
2529                addr2 = (__force u32) ip->daddr;
2530                ihl = ip->ihl;
2531                break;
2532        case __constant_htons(ETH_P_IPV6):
2533                if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2534                        goto done;
2535
2536                ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2537                ip_proto = ip6->nexthdr;
2538                addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2539                addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2540                ihl = (40 >> 2);
2541                break;
2542        default:
2543                goto done;
2544        }
2545
2546        ports.v32 = 0;
2547        poff = proto_ports_offset(ip_proto);
2548        if (poff >= 0) {
2549                nhoff += ihl * 4 + poff;
2550                if (pskb_may_pull(skb, nhoff + 4)) {
2551                        ports.v32 = * (__force u32 *) (skb->data + nhoff);
2552                        if (ports.v16[1] < ports.v16[0])
2553                                swap(ports.v16[0], ports.v16[1]);
2554                }
2555        }
2556
2557        /* get a consistent hash (same value on both flow directions) */
2558        if (addr2 < addr1)
2559                swap(addr1, addr2);
2560
2561        hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2562        if (!hash)
2563                hash = 1;
2564
2565done:
2566        return hash;
2567}
2568EXPORT_SYMBOL(__skb_get_rxhash);
2569
2570#ifdef CONFIG_RPS
2571
2572/* One global table that all flow-based protocols share. */
2573struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2574EXPORT_SYMBOL(rps_sock_flow_table);
2575
2576static struct rps_dev_flow *
2577set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2578            struct rps_dev_flow *rflow, u16 next_cpu)
2579{
2580        u16 tcpu;
2581
2582        tcpu = rflow->cpu = next_cpu;
2583        if (tcpu != RPS_NO_CPU) {
2584#ifdef CONFIG_RFS_ACCEL
2585                struct netdev_rx_queue *rxqueue;
2586                struct rps_dev_flow_table *flow_table;
2587                struct rps_dev_flow *old_rflow;
2588                u32 flow_id;
2589                u16 rxq_index;
2590                int rc;
2591
2592                /* Should we steer this flow to a different hardware queue? */
2593                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2594                    !(dev->features & NETIF_F_NTUPLE))
2595                        goto out;
2596                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2597                if (rxq_index == skb_get_rx_queue(skb))
2598                        goto out;
2599
2600                rxqueue = dev->_rx + rxq_index;
2601                flow_table = rcu_dereference(rxqueue->rps_flow_table);
2602                if (!flow_table)
2603                        goto out;
2604                flow_id = skb->rxhash & flow_table->mask;
2605                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2606                                                        rxq_index, flow_id);
2607                if (rc < 0)
2608                        goto out;
2609                old_rflow = rflow;
2610                rflow = &flow_table->flows[flow_id];
2611                rflow->cpu = next_cpu;
2612                rflow->filter = rc;
2613                if (old_rflow->filter == rflow->filter)
2614                        old_rflow->filter = RPS_NO_FILTER;
2615        out:
2616#endif
2617                rflow->last_qtail =
2618                        per_cpu(softnet_data, tcpu).input_queue_head;
2619        }
2620
2621        return rflow;
2622}
2623
2624/*
2625 * get_rps_cpu is called from netif_receive_skb and returns the target
2626 * CPU from the RPS map of the receiving queue for a given skb.
2627 * rcu_read_lock must be held on entry.
2628 */
2629static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2630                       struct rps_dev_flow **rflowp)
2631{
2632        struct netdev_rx_queue *rxqueue;
2633        struct rps_map *map;
2634        struct rps_dev_flow_table *flow_table;
2635        struct rps_sock_flow_table *sock_flow_table;
2636        int cpu = -1;
2637        u16 tcpu;
2638
2639        if (skb_rx_queue_recorded(skb)) {
2640                u16 index = skb_get_rx_queue(skb);
2641                if (unlikely(index >= dev->real_num_rx_queues)) {
2642                        WARN_ONCE(dev->real_num_rx_queues > 1,
2643                                  "%s received packet on queue %u, but number "
2644                                  "of RX queues is %u\n",
2645                                  dev->name, index, dev->real_num_rx_queues);
2646                        goto done;
2647                }
2648                rxqueue = dev->_rx + index;
2649        } else
2650                rxqueue = dev->_rx;
2651
2652        map = rcu_dereference(rxqueue->rps_map);
2653        if (map) {
2654                if (map->len == 1 &&
2655                    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2656                        tcpu = map->cpus[0];
2657                        if (cpu_online(tcpu))
2658                                cpu = tcpu;
2659                        goto done;
2660                }
2661        } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2662                goto done;
2663        }
2664
2665        skb_reset_network_header(skb);
2666        if (!skb_get_rxhash(skb))
2667                goto done;
2668
2669        flow_table = rcu_dereference(rxqueue->rps_flow_table);
2670        sock_flow_table = rcu_dereference(rps_sock_flow_table);
2671        if (flow_table && sock_flow_table) {
2672                u16 next_cpu;
2673                struct rps_dev_flow *rflow;
2674
2675                rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2676                tcpu = rflow->cpu;
2677
2678                next_cpu = sock_flow_table->ents[skb->rxhash &
2679                    sock_flow_table->mask];
2680
2681                /*
2682                 * If the desired CPU (where last recvmsg was done) is
2683                 * different from current CPU (one in the rx-queue flow
2684                 * table entry), switch if one of the following holds:
2685                 *   - Current CPU is unset (equal to RPS_NO_CPU).
2686                 *   - Current CPU is offline.
2687                 *   - The current CPU's queue tail has advanced beyond the
2688                 *     last packet that was enqueued using this table entry.
2689                 *     This guarantees that all previous packets for the flow
2690                 *     have been dequeued, thus preserving in-order delivery.
2691                 */
2692                if (unlikely(tcpu != next_cpu) &&
2693                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2694                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2695                      rflow->last_qtail)) >= 0))
2696                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2697
2698                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2699                        *rflowp = rflow;
2700                        cpu = tcpu;
2701                        goto done;
2702                }
2703        }
2704
2705        if (map) {
2706                tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2707
2708                if (cpu_online(tcpu)) {
2709                        cpu = tcpu;
2710                        goto done;
2711                }
2712        }
2713
2714done:
2715        return cpu;
2716}
2717
2718#ifdef CONFIG_RFS_ACCEL
2719
2720/**
2721 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2722 * @dev: Device on which the filter was set
2723 * @rxq_index: RX queue index
2724 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2725 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2726 *
2727 * Drivers that implement ndo_rx_flow_steer() should periodically call
2728 * this function for each installed filter and remove the filters for
2729 * which it returns %true.
2730 */
2731bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2732                         u32 flow_id, u16 filter_id)
2733{
2734        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2735        struct rps_dev_flow_table *flow_table;
2736        struct rps_dev_flow *rflow;
2737        bool expire = true;
2738        int cpu;
2739
2740        rcu_read_lock();
2741        flow_table = rcu_dereference(rxqueue->rps_flow_table);
2742        if (flow_table && flow_id <= flow_table->mask) {
2743                rflow = &flow_table->flows[flow_id];
2744                cpu = ACCESS_ONCE(rflow->cpu);
2745                if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2746                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2747                           rflow->last_qtail) <
2748                     (int)(10 * flow_table->mask)))
2749                        expire = false;
2750        }
2751        rcu_read_unlock();
2752        return expire;
2753}
2754EXPORT_SYMBOL(rps_may_expire_flow);
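
/*
 * A minimal sketch of the periodic expiry scan described above,
 * assuming a hypothetical driver that keeps an array of its installed
 * steering rules (the foo_rule layout is made up); expired entries are
 * simply marked free here and the real hardware removal is elided:
 */
struct foo_rule {
        u16 rxq_index;
        u32 flow_id;
        u16 filter_id;
};

static void foo_expire_filters(struct net_device *dev,
                               struct foo_rule *rules, unsigned int n)
{
        unsigned int i;

        for (i = 0; i < n; i++) {
                if (rules[i].filter_id == RPS_NO_FILTER)
                        continue;
                if (rps_may_expire_flow(dev, rules[i].rxq_index,
                                        rules[i].flow_id,
                                        rules[i].filter_id)) {
                        /* remove the rule from the hardware here */
                        rules[i].filter_id = RPS_NO_FILTER;
                }
        }
}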
2755
2756#endif /* CONFIG_RFS_ACCEL */
2757
2758/* Called from hardirq (IPI) context */
2759static void rps_trigger_softirq(void *data)
2760{
2761        struct softnet_data *sd = data;
2762
2763        ____napi_schedule(sd, &sd->backlog);
2764        sd->received_rps++;
2765}
2766
2767#endif /* CONFIG_RPS */
2768
2769/*
2770 * Check if this softnet_data structure belongs to another CPU.
2771 * If yes, queue it to our IPI list and return 1.
2772 * If no, return 0.
2773 */
2774static int rps_ipi_queued(struct softnet_data *sd)
2775{
2776#ifdef CONFIG_RPS
2777        struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2778
2779        if (sd != mysd) {
2780                sd->rps_ipi_next = mysd->rps_ipi_list;
2781                mysd->rps_ipi_list = sd;
2782
2783                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2784                return 1;
2785        }
2786#endif /* CONFIG_RPS */
2787        return 0;
2788}
2789
2790/*
2791 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2792 * queue (may be a remote CPU queue).
2793 */
2794static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2795                              unsigned int *qtail)
2796{
2797        struct softnet_data *sd;
2798        unsigned long flags;
2799
2800        sd = &per_cpu(softnet_data, cpu);
2801
2802        local_irq_save(flags);
2803
2804        rps_lock(sd);
2805        if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2806                if (skb_queue_len(&sd->input_pkt_queue)) {
2807enqueue:
2808                        __skb_queue_tail(&sd->input_pkt_queue, skb);
2809                        input_queue_tail_incr_save(sd, qtail);
2810                        rps_unlock(sd);
2811                        local_irq_restore(flags);
2812                        return NET_RX_SUCCESS;
2813                }
2814
2815                /* Schedule NAPI for the backlog device.
2816                 * We can use a non-atomic operation since we own the queue lock.
2817                 */
2818                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2819                        if (!rps_ipi_queued(sd))
2820                                ____napi_schedule(sd, &sd->backlog);
2821                }
2822                goto enqueue;
2823        }
2824
2825        sd->dropped++;
2826        rps_unlock(sd);
2827
2828        local_irq_restore(flags);
2829
2830        atomic_long_inc(&skb->dev->rx_dropped);
2831        kfree_skb(skb);
2832        return NET_RX_DROP;
2833}
2834
2835/**
2836 *      netif_rx        -       post buffer to the network code
2837 *      @skb: buffer to post
2838 *
2839 *      This function receives a packet from a device driver and queues it for
2840 *      the upper (protocol) levels to process.  It always succeeds. The buffer
2841 *      may be dropped during processing for congestion control or by the
2842 *      protocol layers.
2843 *
2844 *      return values:
2845 *      NET_RX_SUCCESS  (no congestion)
2846 *      NET_RX_DROP     (packet was dropped)
2847 *
2848 */
2849
2850int netif_rx(struct sk_buff *skb)
2851{
2852        int ret;
2853
2854        /* if netpoll wants it, pretend we never saw it */
2855        if (netpoll_rx(skb))
2856                return NET_RX_DROP;
2857
2858        if (netdev_tstamp_prequeue)
2859                net_timestamp_check(skb);
2860
2861        trace_netif_rx(skb);
2862#ifdef CONFIG_RPS
2863        {
2864                struct rps_dev_flow voidflow, *rflow = &voidflow;
2865                int cpu;
2866
2867                preempt_disable();
2868                rcu_read_lock();
2869
2870                cpu = get_rps_cpu(skb->dev, skb, &rflow);
2871                if (cpu < 0)
2872                        cpu = smp_processor_id();
2873
2874                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2875
2876                rcu_read_unlock();
2877                preempt_enable();
2878        }
2879#else
2880        {
2881                unsigned int qtail;
2882                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2883                put_cpu();
2884        }
2885#endif
2886        return ret;
2887}
2888EXPORT_SYMBOL(netif_rx);
2889
2890int netif_rx_ni(struct sk_buff *skb)
2891{
2892        int err;
2893
2894        preempt_disable();
2895        err = netif_rx(skb);
2896        if (local_softirq_pending())
2897                do_softirq();
2898        preempt_enable();
2899
2900        return err;
2901}
2902EXPORT_SYMBOL(netif_rx_ni);
2903
2904static void net_tx_action(struct softirq_action *h)
2905{
2906        struct softnet_data *sd = &__get_cpu_var(softnet_data);
2907
2908        if (sd->completion_queue) {
2909                struct sk_buff *clist;
2910
2911                local_irq_disable();
2912                clist = sd->completion_queue;
2913                sd->completion_queue = NULL;
2914                local_irq_enable();
2915
2916                while (clist) {
2917                        struct sk_buff *skb = clist;
2918                        clist = clist->next;
2919
2920                        WARN_ON(atomic_read(&skb->users));
2921                        trace_kfree_skb(skb, net_tx_action);
2922                        __kfree_skb(skb);
2923                }
2924        }
2925
2926        if (sd->output_queue) {
2927                struct Qdisc *head;
2928
2929                local_irq_disable();
2930                head = sd->output_queue;
2931                sd->output_queue = NULL;
2932                sd->output_queue_tailp = &sd->output_queue;
2933                local_irq_enable();
2934
2935                while (head) {
2936                        struct Qdisc *q = head;
2937                        spinlock_t *root_lock;
2938
2939                        head = head->next_sched;
2940
2941                        root_lock = qdisc_lock(q);
2942                        if (spin_trylock(root_lock)) {
2943                                smp_mb__before_clear_bit();
2944                                clear_bit(__QDISC_STATE_SCHED,
2945                                          &q->state);
2946                                qdisc_run(q);
2947                                spin_unlock(root_lock);
2948                        } else {
2949                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
2950                                              &q->state)) {
2951                                        __netif_reschedule(q);
2952                                } else {
2953                                        smp_mb__before_clear_bit();
2954                                        clear_bit(__QDISC_STATE_SCHED,
2955                                                  &q->state);
2956                                }
2957                        }
2958                }
2959        }
2960}
2961
2962#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2963    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2964/* This hook is defined here for ATM LANE */
2965int (*br_fdb_test_addr_hook)(struct net_device *dev,
2966                             unsigned char *addr) __read_mostly;
2967EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2968#endif
2969
2970#ifdef CONFIG_NET_CLS_ACT
2971/* TODO: Maybe we should just force sch_ingress to be compiled in
2972 * when CONFIG_NET_CLS_ACT is? Otherwise we currently pay for some
2973 * useless instructions (a compare and two extra stores) when we have
2974 * CONFIG_NET_CLS_ACT but sch_ingress is not built.
2975 * NOTE: This doesn't stop any functionality; if you don't have
2976 * the ingress scheduler, you just can't add policies on ingress.
2977 *
2978 */
2979static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2980{
2981        struct net_device *dev = skb->dev;
2982        u32 ttl = G_TC_RTTL(skb->tc_verd);
2983        int result = TC_ACT_OK;
2984        struct Qdisc *q;
2985
2986        if (unlikely(MAX_RED_LOOP < ttl++)) {
2987                if (net_ratelimit())
2988                        pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2989                               skb->skb_iif, dev->ifindex);
2990                return TC_ACT_SHOT;
2991        }
2992
2993        skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2994        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2995
2996        q = rxq->qdisc;
2997        if (q != &noop_qdisc) {
2998                spin_lock(qdisc_lock(q));
2999                if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3000                        result = qdisc_enqueue_root(skb, q);
3001                spin_unlock(qdisc_lock(q));
3002        }
3003
3004        return result;
3005}
3006
3007static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3008                                         struct packet_type **pt_prev,
3009                                         int *ret, struct net_device *orig_dev)
3010{
3011        struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3012
3013        if (!rxq || rxq->qdisc == &noop_qdisc)
3014                goto out;
3015
3016        if (*pt_prev) {
3017                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3018                *pt_prev = NULL;
3019        }
3020
3021        switch (ing_filter(skb, rxq)) {
3022        case TC_ACT_SHOT:
3023        case TC_ACT_STOLEN:
3024                kfree_skb(skb);
3025                return NULL;
3026        }
3027
3028out:
3029        skb->tc_verd = 0;
3030        return skb;
3031}
3032#endif
3033
3034/**
3035 *      netdev_rx_handler_register - register receive handler
3036 *      @dev: device to register a handler for
3037 *      @rx_handler: receive handler to register
3038 *      @rx_handler_data: data pointer that is used by rx handler
3039 *
3040 *      Register a receive handler for a device. This handler will then be
3041 *      called from __netif_receive_skb. A negative errno code is returned
3042 *      on a failure.
3043 *
3044 *      The caller must hold the rtnl_mutex.
3045 *
3046 *      For a general description of rx_handler, see enum rx_handler_result.
3047 */
3048int netdev_rx_handler_register(struct net_device *dev,
3049                               rx_handler_func_t *rx_handler,
3050                               void *rx_handler_data)
3051{
3052        ASSERT_RTNL();
3053
3054        if (dev->rx_handler)
3055                return -EBUSY;
3056
3057        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3058        rcu_assign_pointer(dev->rx_handler, rx_handler);
3059
3060        return 0;
3061}
3062EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3063
3064/**
3065 *      netdev_rx_handler_unregister - unregister receive handler
3066 *      @dev: device to unregister a handler from
3067 *
3068 *      Unregister a receive handler from a device.
3069 *
3070 *      The caller must hold the rtnl_mutex.
3071 */
3072void netdev_rx_handler_unregister(struct net_device *dev)
3073{
3074
3075        ASSERT_RTNL();
3076        rcu_assign_pointer(dev->rx_handler, NULL);
3077        rcu_assign_pointer(dev->rx_handler_data, NULL);
3078}
3079EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
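/*
 * Illustrative sketch only (not part of dev.c, kept under #if 0): how a
 * bridge/bonding-style module might claim a device's receive path with the
 * rx_handler API above.  The names my_rx_handler, my_port_attach and
 * my_port_detach are hypothetical; only the netdev_rx_handler_* calls and
 * rtnl_lock()/rtnl_unlock() are real kernel APIs.
 */
#if 0
static rx_handler_result_t my_rx_handler(struct sk_buff **pskb)
{
        /* A real handler would inspect or redirect *pskb here; returning
         * RX_HANDLER_PASS lets __netif_receive_skb() continue normally.
         */
        return RX_HANDLER_PASS;
}

static int my_port_attach(struct net_device *dev, void *my_port)
{
        int err;

        rtnl_lock();            /* registration requires RTNL */
        err = netdev_rx_handler_register(dev, my_rx_handler, my_port);
        rtnl_unlock();
        return err;
}

static void my_port_detach(struct net_device *dev)
{
        rtnl_lock();
        netdev_rx_handler_unregister(dev);
        rtnl_unlock();
}
#endif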
3080
3081static void vlan_on_bond_hook(struct sk_buff *skb)
3082{
3083        /*
3084         * Make sure ARP frames received on VLAN interfaces stacked on
3085         * bonding interfaces still make their way to any base bonding
3086         * device that may have registered for a specific ptype.
3087         */
3088        if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
3089            vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
3090            skb->protocol == htons(ETH_P_ARP)) {
3091                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
3092
3093                if (!skb2)
3094                        return;
3095                skb2->dev = vlan_dev_real_dev(skb->dev);
3096                netif_rx(skb2);
3097        }
3098}
3099
3100static int __netif_receive_skb(struct sk_buff *skb)
3101{
3102        struct packet_type *ptype, *pt_prev;
3103        rx_handler_func_t *rx_handler;
3104        struct net_device *orig_dev;
3105        struct net_device *null_or_dev;
3106        bool deliver_exact = false;
3107        int ret = NET_RX_DROP;
3108        __be16 type;
3109
3110        if (!netdev_tstamp_prequeue)
3111                net_timestamp_check(skb);
3112
3113        trace_netif_receive_skb(skb);
3114
3115        /* if we've gotten here through NAPI, check netpoll */
3116        if (netpoll_receive_skb(skb))
3117                return NET_RX_DROP;
3118
3119        if (!skb->skb_iif)
3120                skb->skb_iif = skb->dev->ifindex;
3121        orig_dev = skb->dev;
3122
3123        skb_reset_network_header(skb);
3124        skb_reset_transport_header(skb);
3125        skb->mac_len = skb->network_header - skb->mac_header;
3126
3127        pt_prev = NULL;
3128
3129        rcu_read_lock();
3130
3131another_round:
3132
3133        __this_cpu_inc(softnet_data.processed);
3134
3135#ifdef CONFIG_NET_CLS_ACT
3136        if (skb->tc_verd & TC_NCLS) {
3137                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3138                goto ncls;
3139        }
3140#endif
3141
3142        list_for_each_entry_rcu(ptype, &ptype_all, list) {
3143                if (!ptype->dev || ptype->dev == skb->dev) {
3144                        if (pt_prev)
3145                                ret = deliver_skb(skb, pt_prev, orig_dev);
3146                        pt_prev = ptype;
3147                }
3148        }
3149
3150#ifdef CONFIG_NET_CLS_ACT
3151        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3152        if (!skb)
3153                goto out;
3154ncls:
3155#endif
3156
3157        rx_handler = rcu_dereference(skb->dev->rx_handler);
3158        if (rx_handler) {
3159                if (pt_prev) {
3160                        ret = deliver_skb(skb, pt_prev, orig_dev);
3161                        pt_prev = NULL;
3162                }
3163                switch (rx_handler(&skb)) {
3164                case RX_HANDLER_CONSUMED:
3165                        goto out;
3166                case RX_HANDLER_ANOTHER:
3167                        goto another_round;
3168                case RX_HANDLER_EXACT:
3169                        deliver_exact = true;
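                        /* fall through */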
3170                case RX_HANDLER_PASS:
3171                        break;
3172                default:
3173                        BUG();
3174                }
3175        }
3176
3177        if (vlan_tx_tag_present(skb)) {
3178                if (pt_prev) {
3179                        ret = deliver_skb(skb, pt_prev, orig_dev);
3180                        pt_prev = NULL;
3181                }
3182                if (vlan_hwaccel_do_receive(&skb)) {
3183                        ret = __netif_receive_skb(skb);
3184                        goto out;
3185                } else if (unlikely(!skb))
3186                        goto out;
3187        }
3188
3189        vlan_on_bond_hook(skb);
3190
3191        /* deliver only exact match when indicated */
3192        null_or_dev = deliver_exact ? skb->dev : NULL;
3193
3194        type = skb->protocol;
3195        list_for_each_entry_rcu(ptype,
3196                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3197                if (ptype->type == type &&
3198                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3199                     ptype->dev == orig_dev)) {
3200                        if (pt_prev)
3201                                ret = deliver_skb(skb, pt_prev, orig_dev);
3202                        pt_prev = ptype;
3203                }
3204        }
3205
3206        if (pt_prev) {
3207                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3208        } else {
3209                atomic_long_inc(&skb->dev->rx_dropped);
3210                kfree_skb(skb);
3211                /* Jamal, now you will not be able to escape explaining
3212                 * to me how you were going to use this. :-)
3213                 */
3214                ret = NET_RX_DROP;
3215        }
3216
3217out:
3218        rcu_read_unlock();
3219        return ret;
3220}
3221
3222/**
3223 *      netif_receive_skb - process receive buffer from network
3224 *      @skb: buffer to process
3225 *
3226 *      netif_receive_skb() is the main receive data processing function.
3227 *      It always succeeds. The buffer may be dropped during processing
3228 *      for congestion control or by the protocol layers.
3229 *
3230 *      This function may only be called from softirq context and interrupts
3231 *      should be enabled.
3232 *
3233 *      Return values (usually ignored):
3234 *      NET_RX_SUCCESS: no congestion
3235 *      NET_RX_DROP: packet was dropped
3236 */
3237int netif_receive_skb(struct sk_buff *skb)
3238{
3239        if (netdev_tstamp_prequeue)
3240                net_timestamp_check(skb);
3241
3242        if (skb_defer_rx_timestamp(skb))
3243                return NET_RX_SUCCESS;
3244
3245#ifdef CONFIG_RPS
3246        {
3247                struct rps_dev_flow voidflow, *rflow = &voidflow;
3248                int cpu, ret;
3249
3250                rcu_read_lock();
3251
3252                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3253
3254                if (cpu >= 0) {
3255                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3256                        rcu_read_unlock();
3257                } else {
3258                        rcu_read_unlock();
3259                        ret = __netif_receive_skb(skb);
3260                }
3261
3262                return ret;
3263        }
3264#else
3265        return __netif_receive_skb(skb);
3266#endif
3267}
3268EXPORT_SYMBOL(netif_receive_skb);
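/*
 * Illustrative sketch only (not part of dev.c, kept under #if 0): the usual
 * way a driver's NAPI poll routine hands one completed frame to the stack.
 * my_hw_build_skb() and my_dev are hypothetical stand-ins for driver code;
 * eth_type_trans() and netif_receive_skb() are the real APIs, and the call
 * must happen in softirq context as documented above.
 */
#if 0
static int my_deliver_one(struct net_device *my_dev)
{
        struct sk_buff *skb = my_hw_build_skb(my_dev);  /* hypothetical */

        if (!skb)
                return 0;

        skb->protocol = eth_type_trans(skb, my_dev);
        netif_receive_skb(skb);         /* return value is usually ignored */
        return 1;
}
#endif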
3269
3270/* Network device is going away, flush any packets still pending.
3271 * Called with irqs disabled.
3272 */
3273static void flush_backlog(void *arg)
3274{
3275        struct net_device *dev = arg;
3276        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3277        struct sk_buff *skb, *tmp;
3278
3279        rps_lock(sd);
3280        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3281                if (skb->dev == dev) {
3282                        __skb_unlink(skb, &sd->input_pkt_queue);
3283                        kfree_skb(skb);
3284                        input_queue_head_incr(sd);
3285                }
3286        }
3287        rps_unlock(sd);
3288
3289        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3290                if (skb->dev == dev) {
3291                        __skb_unlink(skb, &sd->process_queue);
3292                        kfree_skb(skb);
3293                        input_queue_head_incr(sd);
3294                }
3295        }
3296}
3297
3298static int napi_gro_complete(struct sk_buff *skb)
3299{
3300        struct packet_type *ptype;
3301        __be16 type = skb->protocol;
3302        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3303        int err = -ENOENT;
3304
3305        if (NAPI_GRO_CB(skb)->count == 1) {
3306                skb_shinfo(skb)->gso_size = 0;
3307                goto out;
3308        }
3309
3310        rcu_read_lock();
3311        list_for_each_entry_rcu(ptype, head, list) {
3312                if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3313                        continue;
3314
3315                err = ptype->gro_complete(skb);
3316                break;
3317        }
3318        rcu_read_unlock();
3319
3320        if (err) {
3321                WARN_ON(&ptype->list == head);
3322                kfree_skb(skb);
3323                return NET_RX_SUCCESS;
3324        }
3325
3326out:
3327        return netif_receive_skb(skb);
3328}
3329
3330inline void napi_gro_flush(struct napi_struct *napi)
3331{
3332        struct sk_buff *skb, *next;
3333
3334        for (skb = napi->gro_list; skb; skb = next) {
3335                next = skb->next;
3336                skb->next = NULL;
3337                napi_gro_complete(skb);
3338        }
3339
3340        napi->gro_count = 0;
3341        napi->gro_list = NULL;
3342}
3343EXPORT_SYMBOL(napi_gro_flush);
3344
3345enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3346{
3347        struct sk_buff **pp = NULL;
3348        struct packet_type *ptype;
3349        __be16 type = skb->protocol;
3350        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3351        int same_flow;
3352        int mac_len;
3353        enum gro_result ret;
3354
3355        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3356                goto normal;
3357
3358        if (skb_is_gso(skb) || skb_has_frag_list(skb))
3359                goto normal;
3360
3361        rcu_read_lock();
3362        list_for_each_entry_rcu(ptype, head, list) {
3363                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3364                        continue;
3365
3366                skb_set_network_header(skb, skb_gro_offset(skb));
3367                mac_len = skb->network_header - skb->mac_header;
3368                skb->mac_len = mac_len;
3369                NAPI_GRO_CB(skb)->same_flow = 0;
3370                NAPI_GRO_CB(skb)->flush = 0;
3371                NAPI_GRO_CB(skb)->free = 0;
3372
3373                pp = ptype->gro_receive(&napi->gro_list, skb);
3374                break;
3375        }
3376        rcu_read_unlock();
3377
3378        if (&ptype->list == head)
3379                goto normal;
3380
3381        same_flow = NAPI_GRO_CB(skb)->same_flow;
3382        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3383
3384        if (pp) {
3385                struct sk_buff *nskb = *pp;
3386
3387                *pp = nskb->next;
3388                nskb->next = NULL;
3389                napi_gro_complete(nskb);
3390                napi->gro_count--;
3391        }
3392
3393        if (same_flow)
3394                goto ok;
3395
3396        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3397                goto normal;
3398
3399        napi->gro_count++;
3400        NAPI_GRO_CB(skb)->count = 1;
3401        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3402        skb->next = napi->gro_list;
3403        napi->gro_list = skb;
3404        ret = GRO_HELD;
3405
3406pull:
3407        if (skb_headlen(skb) < skb_gro_offset(skb)) {
3408                int grow = skb_gro_offset(skb) - skb_headlen(skb);
3409
3410                BUG_ON(skb->end - skb->tail < grow);
3411
3412                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3413
3414                skb->tail += grow;
3415                skb->data_len -= grow;
3416
3417                skb_shinfo(skb)->frags[0].page_offset += grow;
3418                skb_shinfo(skb)->frags[0].size -= grow;
3419
3420                if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3421                        put_page(skb_shinfo(skb)->frags[0].page);
3422                        memmove(skb_shinfo(skb)->frags,
3423                                skb_shinfo(skb)->frags + 1,
3424                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3425                }
3426        }
3427
3428ok:
3429        return ret;
3430
3431normal:
3432        ret = GRO_NORMAL;
3433        goto pull;
3434}
3435EXPORT_SYMBOL(dev_gro_receive);
3436
3437static inline gro_result_t
3438__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3439{
3440        struct sk_buff *p;
3441
3442        for (p = napi->gro_list; p; p = p->next) {
3443                unsigned long diffs;
3444
3445                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3446                diffs |= p->vlan_tci ^ skb->vlan_tci;
3447                diffs |= compare_ether_header(skb_mac_header(p),
3448                                              skb_gro_mac_header(skb));
3449                NAPI_GRO_CB(p)->same_flow = !diffs;
3450                NAPI_GRO_CB(p)->flush = 0;
3451        }
3452
3453        return dev_gro_receive(napi, skb);
3454}
3455
3456gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3457{
3458        switch (ret) {
3459        case GRO_NORMAL:
3460                if (netif_receive_skb(skb))
3461                        ret = GRO_DROP;
3462                break;
3463
3464        case GRO_DROP:
3465        case GRO_MERGED_FREE:
3466                kfree_skb(skb);
3467                break;
3468
3469        case GRO_HELD:
3470        case GRO_MERGED:
3471                break;
3472        }
3473
3474        return ret;
3475}
3476EXPORT_SYMBOL(napi_skb_finish);
3477
3478void skb_gro_reset_offset(struct sk_buff *skb)
3479{
3480        NAPI_GRO_CB(skb)->data_offset = 0;
3481        NAPI_GRO_CB(skb)->frag0 = NULL;
3482        NAPI_GRO_CB(skb)->frag0_len = 0;
3483
3484        if (skb->mac_header == skb->tail &&
3485            !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3486                NAPI_GRO_CB(skb)->frag0 =
3487                        page_address(skb_shinfo(skb)->frags[0].page) +
3488                        skb_shinfo(skb)->frags[0].page_offset;
3489                NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3490        }
3491}
3492EXPORT_SYMBOL(skb_gro_reset_offset);
3493
3494gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3495{
3496        skb_gro_reset_offset(skb);
3497
3498        return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3499}
3500EXPORT_SYMBOL(napi_gro_receive);
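/*
 * Illustrative sketch only (not part of dev.c, kept under #if 0): a
 * GRO-capable driver passes linear skbs to napi_gro_receive() instead of
 * calling netif_receive_skb() directly, so same-flow segments can be
 * merged.  my_napi and the skb itself are assumed to come from driver code.
 */
#if 0
static void my_gro_deliver(struct napi_struct *my_napi, struct sk_buff *skb)
{
        skb->protocol = eth_type_trans(skb, my_napi->dev);
        napi_gro_receive(my_napi, skb);
}
#endif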
3501
3502static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3503{
3504        __skb_pull(skb, skb_headlen(skb));
3505        skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3506        skb->vlan_tci = 0;
3507        skb->dev = napi->dev;
3508        skb->skb_iif = 0;
3509
3510        napi->skb = skb;
3511}
3512
3513struct sk_buff *napi_get_frags(struct napi_struct *napi)
3514{
3515        struct sk_buff *skb = napi->skb;
3516
3517        if (!skb) {
3518                skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3519                if (skb)
3520                        napi->skb = skb;
3521        }
3522        return skb;
3523}
3524EXPORT_SYMBOL(napi_get_frags);
3525
3526gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3527                               gro_result_t ret)
3528{
3529        switch (ret) {
3530        case GRO_NORMAL:
3531        case GRO_HELD:
3532                skb->protocol = eth_type_trans(skb, skb->dev);
3533
3534                if (ret == GRO_HELD)
3535                        skb_gro_pull(skb, -ETH_HLEN);
3536                else if (netif_receive_skb(skb))
3537                        ret = GRO_DROP;
3538                break;
3539
3540        case GRO_DROP:
3541        case GRO_MERGED_FREE:
3542                napi_reuse_skb(napi, skb);
3543                break;
3544
3545        case GRO_MERGED:
3546                break;
3547        }
3548
3549        return ret;
3550}
3551EXPORT_SYMBOL(napi_frags_finish);
3552
3553struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3554{
3555        struct sk_buff *skb = napi->skb;
3556        struct ethhdr *eth;
3557        unsigned int hlen;
3558        unsigned int off;
3559
3560        napi->skb = NULL;
3561
3562        skb_reset_mac_header(skb);
3563        skb_gro_reset_offset(skb);
3564
3565        off = skb_gro_offset(skb);
3566        hlen = off + sizeof(*eth);
3567        eth = skb_gro_header_fast(skb, off);
3568        if (skb_gro_header_hard(skb, hlen)) {
3569                eth = skb_gro_header_slow(skb, hlen, off);
3570                if (unlikely(!eth)) {
3571                        napi_reuse_skb(napi, skb);
3572                        skb = NULL;
3573                        goto out;
3574                }
3575        }
3576
3577        skb_gro_pull(skb, sizeof(*eth));
3578
3579        /*
3580         * This works because the only protocols we care about don't require
3581         * special handling.  We'll fix it up properly at the end.
3582         */
3583        skb->protocol = eth->h_proto;
3584
3585out:
3586        return skb;
3587}
3588EXPORT_SYMBOL(napi_frags_skb);
3589
3590gro_result_t napi_gro_frags(struct napi_struct *napi)
3591{
3592        struct sk_buff *skb = napi_frags_skb(napi);
3593
3594        if (!skb)
3595                return GRO_DROP;
3596
3597        return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3598}
3599EXPORT_SYMBOL(napi_gro_frags);
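/*
 * Illustrative sketch only (not part of dev.c, kept under #if 0): the
 * napi_get_frags()/napi_gro_frags() pair used by drivers that receive into
 * pages rather than into a linear skb.  The page/off/len values are assumed
 * to come from hypothetical driver state; the skb accounting mirrors what
 * such drivers typically do before handing the frame to GRO.
 */
#if 0
static void my_frag_receive(struct napi_struct *napi, struct page *page,
                            unsigned int off, unsigned int len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (!skb) {
                put_page(page);         /* no skb available, drop the frame */
                return;
        }

        skb_fill_page_desc(skb, 0, page, off, len);
        skb->len += len;
        skb->data_len += len;
        skb->truesize += len;

        napi_gro_frags(napi);           /* parses the Ethernet header itself */
}
#endif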
3600
3601/*
3602 * net_rps_action sends any pending IPIs for rps.
3603 * Note: called with local irq disabled, but exits with local irq enabled.
3604 */
3605static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3606{
3607#ifdef CONFIG_RPS
3608        struct softnet_data *remsd = sd->rps_ipi_list;
3609
3610        if (remsd) {
3611                sd->rps_ipi_list = NULL;
3612
3613                local_irq_enable();
3614
3615                /* Send pending IPIs to kick RPS processing on remote cpus. */
3616                while (remsd) {
3617                        struct softnet_data *next = remsd->rps_ipi_next;
3618
3619                        if (cpu_online(remsd->cpu))
3620                                __smp_call_function_single(remsd->cpu,
3621                                                           &remsd->csd, 0);
3622                        remsd = next;
3623                }
3624        } else
3625#endif
3626                local_irq_enable();
3627}
3628
3629static int process_backlog(struct napi_struct *napi, int quota)
3630{
3631        int work = 0;
3632        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3633
3634#ifdef CONFIG_RPS
3635        /* Check if we have pending IPIs; it's better to send them now
3636         * than to wait until net_rx_action() ends.
3637         */
3638        if (sd->rps_ipi_list) {
3639                local_irq_disable();
3640                net_rps_action_and_irq_enable(sd);
3641        }
3642#endif
3643        napi->weight = weight_p;
3644        local_irq_disable();
3645        while (work < quota) {
3646                struct sk_buff *skb;
3647                unsigned int qlen;
3648
3649                while ((skb = __skb_dequeue(&sd->process_queue))) {
3650                        local_irq_enable();
3651                        __netif_receive_skb(skb);
3652                        local_irq_disable();
3653                        input_queue_head_incr(sd);
3654                        if (++work >= quota) {
3655                                local_irq_enable();
3656                                return work;
3657                        }
3658                }
3659
3660                rps_lock(sd);
3661                qlen = skb_queue_len(&sd->input_pkt_queue);
3662                if (qlen)
3663                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
3664                                                   &sd->process_queue);
3665
3666                if (qlen < quota - work) {
3667                        /*
3668                         * Inline a custom version of __napi_complete().
3669                         * Only the current cpu owns and manipulates this napi,
3670                         * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3671                         * so we can use a plain write instead of clear_bit()
3672                         * and we don't need an smp_mb() memory barrier.
3673                         */
3674                        list_del(&napi->poll_list);
3675                        napi->state = 0;
3676
3677                        quota = work + qlen;
3678                }
3679                rps_unlock(sd);
3680        }
3681        local_irq_enable();
3682
3683        return work;
3684}
3685
3686/**
3687 * __napi_schedule - schedule for receive
3688 * @n: entry to schedule
3689 *
3690 * The entry's receive function will be scheduled to run
3691 */
3692void __napi_schedule(struct napi_struct *n)
3693{
3694        unsigned long flags;
3695
3696        local_irq_save(flags);
3697        ____napi_schedule(&__get_cpu_var(softnet_data), n);
3698        local_irq_restore(flags);
3699}
3700EXPORT_SYMBOL(__napi_schedule);
3701
3702void __napi_complete(struct napi_struct *n)
3703{
3704        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3705        BUG_ON(n->gro_list);
3706
3707        list_del(&n->poll_list);
3708        smp_mb__before_clear_bit();
3709        clear_bit(NAPI_STATE_SCHED, &n->state);
3710}
3711EXPORT_SYMBOL(__napi_complete);
3712
3713void napi_complete(struct napi_struct *n)
3714{
3715        unsigned long flags;
3716
3717        /*
3718         * Don't let napi dequeue from the cpu poll list
3719         * just in case it's running on a different cpu.
3720         */
3721        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3722                return;
3723
3724        napi_gro_flush(n);
3725        local_irq_save(flags);
3726        __napi_complete(n);
3727        local_irq_restore(flags);
3728}
3729EXPORT_SYMBOL(napi_complete);
3730
3731void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3732                    int (*poll)(struct napi_struct *, int), int weight)
3733{
3734        INIT_LIST_HEAD(&napi->poll_list);
3735        napi->gro_count = 0;
3736        napi->gro_list = NULL;
3737        napi->skb = NULL;
3738        napi->poll = poll;
3739        napi->weight = weight;
3740        list_add(&napi->dev_list, &dev->napi_list);
3741        napi->dev = dev;
3742#ifdef CONFIG_NETPOLL
3743        spin_lock_init(&napi->poll_lock);
3744        napi->poll_owner = -1;
3745#endif
3746        set_bit(NAPI_STATE_SCHED, &napi->state);
3747}
3748EXPORT_SYMBOL(netif_napi_add);
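/*
 * Illustrative sketch only (not part of dev.c, kept under #if 0): the NAPI
 * life cycle a driver typically builds around netif_napi_add().  my_adapter,
 * my_rx_clean(), my_irq_enable() and my_irq_disable() are hypothetical
 * driver pieces; netif_napi_add(), napi_enable(), napi_schedule() and
 * napi_complete() are the real APIs.
 */
#if 0
struct my_adapter {
        struct net_device       *netdev;
        struct napi_struct      napi;
};

static int my_poll(struct napi_struct *napi, int budget)
{
        struct my_adapter *ad = container_of(napi, struct my_adapter, napi);
        int work = my_rx_clean(ad, budget);     /* hypothetical rx processing */

        if (work < budget) {
                napi_complete(napi);            /* done: leave the poll list */
                my_irq_enable(ad);              /* re-arm the device interrupt */
        }
        return work;
}

static irqreturn_t my_intr(int irq, void *data)
{
        struct my_adapter *ad = data;

        my_irq_disable(ad);
        napi_schedule(&ad->napi);               /* work is done in net_rx_action() */
        return IRQ_HANDLED;
}

static void my_napi_setup(struct my_adapter *ad)
{
        netif_napi_add(ad->netdev, &ad->napi, my_poll, 64);
        napi_enable(&ad->napi);                 /* clears the initial SCHED bit */
}
#endif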
3749
3750void netif_napi_del(struct napi_struct *napi)
3751{
3752        struct sk_buff *skb, *next;
3753
3754        list_del_init(&napi->dev_list);
3755        napi_free_frags(napi);
3756
3757        for (skb = napi->gro_list; skb; skb = next) {
3758                next = skb->next;
3759                skb->next = NULL;
3760                kfree_skb(skb);
3761        }
3762
3763        napi->gro_list = NULL;
3764        napi->gro_count = 0;
3765}
3766EXPORT_SYMBOL(netif_napi_del);
3767
3768static void net_rx_action(struct softirq_action *h)
3769{
3770        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3771        unsigned long time_limit = jiffies + 2;
3772        int budget = netdev_budget;
3773        void *have;
3774
3775        local_irq_disable();
3776
3777        while (!list_empty(&sd->poll_list)) {
3778                struct napi_struct *n;
3779                int work, weight;
3780
3781                /* If the softirq window is exhausted then punt.
3782                 * Allow this to run for 2 jiffies, which allows
3783                 * an average latency of 1.5/HZ.
3784                 */
3785                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3786                        goto softnet_break;
3787
3788                local_irq_enable();
3789
3790                /* Even though interrupts have been re-enabled, this
3791                 * access is safe because interrupts can only add new
3792                 * entries to the tail of this list, and only ->poll()
3793                 * calls can remove this head entry from the list.
3794                 */
3795                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3796
3797                have = netpoll_poll_lock(n);
3798
3799                weight = n->weight;
3800
3801                /* This NAPI_STATE_SCHED test is for avoiding a race
3802                 * with netpoll's poll_napi().  Only the entity which
3803                 * obtains the lock and sees NAPI_STATE_SCHED set will
3804                 * actually make the ->poll() call.  Therefore we avoid
3805                 * accidentally calling ->poll() when NAPI is not scheduled.
3806                 */
3807                work = 0;
3808                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3809                        work = n->poll(n, weight);
3810                        trace_napi_poll(n);
3811                }
3812
3813                WARN_ON_ONCE(work > weight);
3814
3815                budget -= work;
3816
3817                local_irq_disable();
3818
3819                /* Drivers must not modify the NAPI state if they
3820                 * consume the entire weight.  In such cases this code
3821                 * still "owns" the NAPI instance and therefore can
3822                 * move the instance around on the list at-will.
3823                 */
3824                if (unlikely(work == weight)) {
3825                        if (unlikely(napi_disable_pending(n))) {
3826                                local_irq_enable();
3827                                napi_complete(n);
3828                                local_irq_disable();
3829                        } else
3830                                list_move_tail(&n->poll_list, &sd->poll_list);
3831                }
3832
3833                netpoll_poll_unlock(have);
3834        }
3835out:
3836        net_rps_action_and_irq_enable(sd);
3837
3838#ifdef CONFIG_NET_DMA
3839        /*
3840         * There may not be any more sk_buffs coming right now, so push
3841         * any pending DMA copies to hardware
3842         */
3843        dma_issue_pending_all();
3844#endif
3845
3846        return;
3847
3848softnet_break:
3849        sd->time_squeeze++;
3850        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3851        goto out;
3852}
3853
3854static gifconf_func_t *gifconf_list[NPROTO];
3855
3856/**
3857 *      register_gifconf        -       register a SIOCGIF handler
3858 *      @family: Address family
3859 *      @gifconf: Function handler
3860 *
3861 *      Register protocol dependent address dumping routines. The handler
3862 *      that is passed must not be freed or reused until it has been replaced
3863 *      by another handler.
3864 */
3865int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3866{
3867        if (family >= NPROTO)
3868                return -EINVAL;
3869        gifconf_list[family] = gifconf;
3870        return 0;
3871}
3872EXPORT_SYMBOL(register_gifconf);
3873
3874
3875/*
3876 *      Map an interface index to its name (SIOCGIFNAME)
3877 */
3878
3879/*
3880 *      We need this ioctl for efficient implementation of the
3881 *      if_indextoname() function required by the IPv6 API.  Without
3882 *      it, we would have to search all the interfaces to find a
3883 *      match.  --pb
3884 */
3885
3886static int dev_ifname(struct net *net, struct ifreq __user *arg)
3887{
3888        struct net_device *dev;
3889        struct ifreq ifr;
3890
3891        /*
3892         *      Fetch the caller's info block.
3893         */
3894
3895        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3896                return -EFAULT;
3897
3898        rcu_read_lock();
3899        dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3900        if (!dev) {
3901                rcu_read_unlock();
3902                return -ENODEV;
3903        }
3904
3905        strcpy(ifr.ifr_name, dev->name);
3906        rcu_read_unlock();
3907
3908        if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3909                return -EFAULT;
3910        return 0;
3911}
3912
3913/*
3914 *      Perform a SIOCGIFCONF call. This structure will change
3915 *      size eventually, and there is nothing I can do about it.
3916 *      Thus we will need a 'compatibility mode'.
3917 */
3918
3919static int dev_ifconf(struct net *net, char __user *arg)
3920{
3921        struct ifconf ifc;
3922        struct net_device *dev;
3923        char __user *pos;
3924        int len;
3925        int total;
3926        int i;
3927
3928        /*
3929         *      Fetch the caller's info block.
3930         */
3931
3932        if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3933                return -EFAULT;
3934
3935        pos = ifc.ifc_buf;
3936        len = ifc.ifc_len;
3937
3938        /*
3939         *      Loop over the interfaces, and write an info block for each.
3940         */
3941
3942        total = 0;
3943        for_each_netdev(net, dev) {
3944                for (i = 0; i < NPROTO; i++) {
3945                        if (gifconf_list[i]) {
3946                                int done;
3947                                if (!pos)
3948                                        done = gifconf_list[i](dev, NULL, 0);
3949                                else
3950                                        done = gifconf_list[i](dev, pos + total,
3951                                                               len - total);
3952                                if (done < 0)
3953                                        return -EFAULT;
3954                                total += done;
3955                        }
3956                }
3957        }
3958
3959        /*
3960         *      All done.  Write the updated control block back to the caller.
3961         */
3962        ifc.ifc_len = total;
3963
3964        /*
3965         *      Both BSD and Solaris return 0 here, so we do too.
3966         */
3967        return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3968}
3969
3970#ifdef CONFIG_PROC_FS
3971/*
3972 *      This is invoked by the /proc filesystem handler to display a device
3973 *      in detail.
3974 */
3975void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3976        __acquires(RCU)
3977{
3978        struct net *net = seq_file_net(seq);
3979        loff_t off;
3980        struct net_device *dev;
3981
3982        rcu_read_lock();
3983        if (!*pos)
3984                return SEQ_START_TOKEN;
3985
3986        off = 1;
3987        for_each_netdev_rcu(net, dev)
3988                if (off++ == *pos)
3989                        return dev;
3990
3991        return NULL;
3992}
3993
3994void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3995{
3996        struct net_device *dev = v;
3997
3998        if (v == SEQ_START_TOKEN)
3999                dev = first_net_device_rcu(seq_file_net(seq));
4000        else
4001                dev = next_net_device_rcu(dev);
4002
4003        ++*pos;
4004        return dev;
4005}
4006
4007void dev_seq_stop(struct seq_file *seq, void *v)
4008        __releases(RCU)
4009{
4010        rcu_read_unlock();
4011}
4012
4013static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4014{
4015        struct rtnl_link_stats64 temp;
4016        const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4017
4018        seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4019                   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4020                   dev->name, stats->rx_bytes, stats->rx_packets,
4021                   stats->rx_errors,
4022                   stats->rx_dropped + stats->rx_missed_errors,
4023                   stats->rx_fifo_errors,
4024                   stats->rx_length_errors + stats->rx_over_errors +
4025                    stats->rx_crc_errors + stats->rx_frame_errors,
4026                   stats->rx_compressed, stats->multicast,
4027                   stats->tx_bytes, stats->tx_packets,
4028                   stats->tx_errors, stats->tx_dropped,
4029                   stats->tx_fifo_errors, stats->collisions,
4030                   stats->tx_carrier_errors +
4031                    stats->tx_aborted_errors +
4032                    stats->tx_window_errors +
4033                    stats->tx_heartbeat_errors,
4034                   stats->tx_compressed);
4035}
4036
4037/*
4038 *      Called from the PROCfs module. This now uses the new arbitrary sized
4039 *      /proc/net interface to create /proc/net/dev
4040 */
4041static int dev_seq_show(struct seq_file *seq, void *v)
4042{
4043        if (v == SEQ_START_TOKEN)
4044                seq_puts(seq, "Inter-|   Receive                            "
4045                              "                    |  Transmit\n"
4046                              " face |bytes    packets errs drop fifo frame "
4047                              "compressed multicast|bytes    packets errs "
4048                              "drop fifo colls carrier compressed\n");
4049        else
4050                dev_seq_printf_stats(seq, v);
4051        return 0;
4052}
4053
4054static struct softnet_data *softnet_get_online(loff_t *pos)
4055{
4056        struct softnet_data *sd = NULL;
4057
4058        while (*pos < nr_cpu_ids)
4059                if (cpu_online(*pos)) {
4060                        sd = &per_cpu(softnet_data, *pos);
4061                        break;
4062                } else
4063                        ++*pos;
4064        return sd;
4065}
4066
4067static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4068{
4069        return softnet_get_online(pos);
4070}
4071
4072static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4073{
4074        ++*pos;
4075        return softnet_get_online(pos);
4076}
4077
4078static void softnet_seq_stop(struct seq_file *seq, void *v)
4079{
4080}
4081
4082static int softnet_seq_show(struct seq_file *seq, void *v)
4083{
4084        struct softnet_data *sd = v;
4085
4086        seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4087                   sd->processed, sd->dropped, sd->time_squeeze, 0,
4088                   0, 0, 0, 0, /* was fastroute */
4089                   sd->cpu_collision, sd->received_rps);
4090        return 0;
4091}
4092
4093static const struct seq_operations dev_seq_ops = {
4094        .start = dev_seq_start,
4095        .next  = dev_seq_next,
4096        .stop  = dev_seq_stop,
4097        .show  = dev_seq_show,
4098};
4099
4100static int dev_seq_open(struct inode *inode, struct file *file)
4101{
4102        return seq_open_net(inode, file, &dev_seq_ops,
4103                            sizeof(struct seq_net_private));
4104}
4105
4106static const struct file_operations dev_seq_fops = {
4107        .owner   = THIS_MODULE,
4108        .open    = dev_seq_open,
4109        .read    = seq_read,
4110        .llseek  = seq_lseek,
4111        .release = seq_release_net,
4112};
4113
4114static const struct seq_operations softnet_seq_ops = {
4115        .start = softnet_seq_start,
4116        .next  = softnet_seq_next,
4117        .stop  = softnet_seq_stop,
4118        .show  = softnet_seq_show,
4119};
4120
4121static int softnet_seq_open(struct inode *inode, struct file *file)
4122{
4123        return seq_open(file, &softnet_seq_ops);
4124}
4125
4126static const struct file_operations softnet_seq_fops = {
4127        .owner   = THIS_MODULE,
4128        .open    = softnet_seq_open,
4129        .read    = seq_read,
4130        .llseek  = seq_lseek,
4131        .release = seq_release,
4132};
4133
4134static void *ptype_get_idx(loff_t pos)
4135{
4136        struct packet_type *pt = NULL;
4137        loff_t i = 0;
4138        int t;
4139
4140        list_for_each_entry_rcu(pt, &ptype_all, list) {
4141                if (i == pos)
4142                        return pt;
4143                ++i;
4144        }
4145
4146        for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4147                list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4148                        if (i == pos)
4149                                return pt;
4150                        ++i;
4151                }
4152        }
4153        return NULL;
4154}
4155
4156static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4157        __acquires(RCU)
4158{
4159        rcu_read_lock();
4160        return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4161}
4162
4163static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4164{
4165        struct packet_type *pt;
4166        struct list_head *nxt;
4167        int hash;
4168
4169        ++*pos;
4170        if (v == SEQ_START_TOKEN)
4171                return ptype_get_idx(0);
4172
4173        pt = v;
4174        nxt = pt->list.next;
4175        if (pt->type == htons(ETH_P_ALL)) {
4176                if (nxt != &ptype_all)
4177                        goto found;
4178                hash = 0;
4179                nxt = ptype_base[0].next;
4180        } else
4181                hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4182
4183        while (nxt == &ptype_base[hash]) {
4184                if (++hash >= PTYPE_HASH_SIZE)
4185                        return NULL;
4186                nxt = ptype_base[hash].next;
4187        }
4188found:
4189        return list_entry(nxt, struct packet_type, list);
4190}
4191
4192static void ptype_seq_stop(struct seq_file *seq, void *v)
4193        __releases(RCU)
4194{
4195        rcu_read_unlock();
4196}
4197
4198static int ptype_seq_show(struct seq_file *seq, void *v)
4199{
4200        struct packet_type *pt = v;
4201
4202        if (v == SEQ_START_TOKEN)
4203                seq_puts(seq, "Type Device      Function\n");
4204        else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4205                if (pt->type == htons(ETH_P_ALL))
4206                        seq_puts(seq, "ALL ");
4207                else
4208                        seq_printf(seq, "%04x", ntohs(pt->type));
4209
4210                seq_printf(seq, " %-8s %pF\n",
4211                           pt->dev ? pt->dev->name : "", pt->func);
4212        }
4213
4214        return 0;
4215}
4216
4217static const struct seq_operations ptype_seq_ops = {
4218        .start = ptype_seq_start,
4219        .next  = ptype_seq_next,
4220        .stop  = ptype_seq_stop,
4221        .show  = ptype_seq_show,
4222};
4223
4224static int ptype_seq_open(struct inode *inode, struct file *file)
4225{
4226        return seq_open_net(inode, file, &ptype_seq_ops,
4227                        sizeof(struct seq_net_private));
4228}
4229
4230static const struct file_operations ptype_seq_fops = {
4231        .owner   = THIS_MODULE,
4232        .open    = ptype_seq_open,
4233        .read    = seq_read,
4234        .llseek  = seq_lseek,
4235        .release = seq_release_net,
4236};
4237
4238
4239static int __net_init dev_proc_net_init(struct net *net)
4240{
4241        int rc = -ENOMEM;
4242
4243        if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4244                goto out;
4245        if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4246                goto out_dev;
4247        if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4248                goto out_softnet;
4249
4250        if (wext_proc_init(net))
4251                goto out_ptype;
4252        rc = 0;
4253out:
4254        return rc;
4255out_ptype:
4256        proc_net_remove(net, "ptype");
4257out_softnet:
4258        proc_net_remove(net, "softnet_stat");
4259out_dev:
4260        proc_net_remove(net, "dev");
4261        goto out;
4262}
4263
4264static void __net_exit dev_proc_net_exit(struct net *net)
4265{
4266        wext_proc_exit(net);
4267
4268        proc_net_remove(net, "ptype");
4269        proc_net_remove(net, "softnet_stat");
4270        proc_net_remove(net, "dev");
4271}
4272
4273static struct pernet_operations __net_initdata dev_proc_ops = {
4274        .init = dev_proc_net_init,
4275        .exit = dev_proc_net_exit,
4276};
4277
4278static int __init dev_proc_init(void)
4279{
4280        return register_pernet_subsys(&dev_proc_ops);
4281}
4282#else
4283#define dev_proc_init() 0
4284#endif  /* CONFIG_PROC_FS */
4285
4286
4287/**
4288 *      netdev_set_master       -       set up master pointer
4289 *      @slave: slave device
4290 *      @master: new master device
4291 *
4292 *      Changes the master device of the slave. Pass %NULL to break the
4293 *      bonding. The caller must hold the RTNL semaphore. On a failure
4294 *      a negative errno code is returned. On success the reference counts
4295 *      are adjusted and the function returns zero.
4296 */
4297int netdev_set_master(struct net_device *slave, struct net_device *master)
4298{
4299        struct net_device *old = slave->master;
4300
4301        ASSERT_RTNL();
4302
4303        if (master) {
4304                if (old)
4305                        return -EBUSY;
4306                dev_hold(master);
4307        }
4308
4309        slave->master = master;
4310
4311        if (old) {
4312                synchronize_net();
4313                dev_put(old);
4314        }
4315        return 0;
4316}
4317EXPORT_SYMBOL(netdev_set_master);
4318
4319/**
4320 *      netdev_set_bond_master  -       set up bonding master/slave pair
4321 *      @slave: slave device
4322 *      @master: new master device
4323 *
4324 *      Changes the master device of the slave. Pass %NULL to break the
4325 *      bonding. The caller must hold the RTNL semaphore. On a failure
4326 *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4327 *      to the routing socket and the function returns zero.
4328 */
4329int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4330{
4331        int err;
4332
4333        ASSERT_RTNL();
4334
4335        err = netdev_set_master(slave, master);
4336        if (err)
4337                return err;
4338        if (master)
4339                slave->flags |= IFF_SLAVE;
4340        else
4341                slave->flags &= ~IFF_SLAVE;
4342
4343        rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4344        return 0;
4345}
4346EXPORT_SYMBOL(netdev_set_bond_master);
4347
4348static void dev_change_rx_flags(struct net_device *dev, int flags)
4349{
4350        const struct net_device_ops *ops = dev->netdev_ops;
4351
4352        if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4353                ops->ndo_change_rx_flags(dev, flags);
4354}
4355
4356static int __dev_set_promiscuity(struct net_device *dev, int inc)
4357{
4358        unsigned short old_flags = dev->flags;
4359        uid_t uid;
4360        gid_t gid;
4361
4362        ASSERT_RTNL();
4363
4364        dev->flags |= IFF_PROMISC;
4365        dev->promiscuity += inc;
4366        if (dev->promiscuity == 0) {
4367                /*
4368                 * Avoid overflow.
4369                 * If inc causes overflow, leave promisc untouched and return an error.
4370                 */
4371                if (inc < 0)
4372                        dev->flags &= ~IFF_PROMISC;
4373                else {
4374                        dev->promiscuity -= inc;
4375                        printk(KERN_WARNING "%s: promiscuity counter overflowed, "
4376                                "set promiscuity failed; promiscuity feature "
4377                                "of device might be broken.\n", dev->name);
4378                        return -EOVERFLOW;
4379                }
4380        }
4381        if (dev->flags != old_flags) {
4382                printk(KERN_INFO "device %s %s promiscuous mode\n",
4383                       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4384                                                               "left");
4385                if (audit_enabled) {
4386                        current_uid_gid(&uid, &gid);
4387                        audit_log(current->audit_context, GFP_ATOMIC,
4388                                AUDIT_ANOM_PROMISCUOUS,
4389                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4390                                dev->name, (dev->flags & IFF_PROMISC),
4391                                (old_flags & IFF_PROMISC),
4392                                audit_get_loginuid(current),
4393                                uid, gid,
4394                                audit_get_sessionid(current));
4395                }
4396
4397                dev_change_rx_flags(dev, IFF_PROMISC);
4398        }
4399        return 0;
4400}
4401
4402/**
4403 *      dev_set_promiscuity     - update promiscuity count on a device
4404 *      @dev: device
4405 *      @inc: modifier
4406 *
4407 *      Add or remove promiscuity from a device. While the count in the device
4408 *      remains above zero the interface remains promiscuous. Once it hits zero
4409 *      the device reverts back to normal filtering operation. A negative inc
4410 *      value is used to drop promiscuity on the device.
4411 *      Return 0 if successful or a negative errno code on error.
4412 */
4413int dev_set_promiscuity(struct net_device *dev, int inc)
4414{
4415        unsigned short old_flags = dev->flags;
4416        int err;
4417
4418        err = __dev_set_promiscuity(dev, inc);
4419        if (err < 0)
4420                return err;
4421        if (dev->flags != old_flags)
4422                dev_set_rx_mode(dev);
4423        return err;
4424}
4425EXPORT_SYMBOL(dev_set_promiscuity);
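/*
 * Illustrative sketch only (not part of dev.c, kept under #if 0): a
 * packet-capture style user of dev_set_promiscuity().  Each +1 must later
 * be balanced by a -1, and RTNL must be held because __dev_set_promiscuity()
 * asserts it.  my_capture_start/my_capture_stop are hypothetical names.
 */
#if 0
static int my_capture_start(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);
        rtnl_unlock();
        return err;
}

static void my_capture_stop(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);
        rtnl_unlock();
}
#endif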
4426
4427/**
4428 *      dev_set_allmulti        - update allmulti count on a device
4429 *      @dev: device
4430 *      @inc: modifier
4431 *
4432 *      Add or remove reception of all multicast frames on a device. While the
4433 *      count in the device remains above zero the interface remains listening
4434 *      to all multicast frames. Once it hits zero the device reverts back to normal
4435 *      filtering operation. A negative @inc value is used to drop the counter
4436 *      when releasing a resource needing all multicasts.
4437 *      Return 0 if successful or a negative errno code on error.
4438 */
4439
4440int dev_set_allmulti(struct net_device *dev, int inc)
4441{
4442        unsigned short old_flags = dev->flags;
4443
4444        ASSERT_RTNL();
4445
4446        dev->flags |= IFF_ALLMULTI;
4447        dev->allmulti += inc;
4448        if (dev->allmulti == 0) {
4449                /*
4450                 * Avoid overflow.
4451                 * If inc causes overflow, leave allmulti untouched and return an error.
4452                 */
4453                if (inc < 0)
4454                        dev->flags &= ~IFF_ALLMULTI;
4455                else {
4456                        dev->allmulti -= inc;
4457                        printk(KERN_WARNING "%s: allmulti counter overflowed, "
4458                                "set allmulti failed; allmulti feature of "
4459                                "device might be broken.\n", dev->name);
4460                        return -EOVERFLOW;
4461                }
4462        }
4463        if (dev->flags ^ old_flags) {
4464                dev_change_rx_flags(dev, IFF_ALLMULTI);
4465                dev_set_rx_mode(dev);
4466        }
4467        return 0;
4468}
4469EXPORT_SYMBOL(dev_set_allmulti);
4470
4471/*
4472 *      Upload unicast and multicast address lists to device and
4473 *      configure RX filtering. When the device doesn't support unicast
4474 *      filtering it is put in promiscuous mode while unicast addresses
4475 *      are present.
4476 */
4477void __dev_set_rx_mode(struct net_device *dev)
4478{
4479        const struct net_device_ops *ops = dev->netdev_ops;
4480
4481        /* dev_open will call this function so the list will stay sane. */
4482        if (!(dev->flags&IFF_UP))
4483                return;
4484
4485        if (!netif_device_present(dev))
4486                return;
4487
4488        if (ops->ndo_set_rx_mode)
4489                ops->ndo_set_rx_mode(dev);
4490        else {
4491                /* Unicast address changes may only happen under the rtnl,
4492                 * therefore calling __dev_set_promiscuity here is safe.
4493                 */
4494                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4495                        __dev_set_promiscuity(dev, 1);
4496                        dev->uc_promisc = 1;
4497                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4498                        __dev_set_promiscuity(dev, -1);
4499                        dev->uc_promisc = 0;
4500                }
4501
4502                if (ops->ndo_set_multicast_list)
4503                        ops->ndo_set_multicast_list(dev);
4504        }
4505}
4506
4507void dev_set_rx_mode(struct net_device *dev)
4508{
4509        netif_addr_lock_bh(dev);
4510        __dev_set_rx_mode(dev);
4511        netif_addr_unlock_bh(dev);
4512}
4513
4514/**
4515 *      dev_get_flags - get flags reported to userspace
4516 *      @dev: device
4517 *
4518 *      Get the combination of flag bits exported through APIs to userspace.
4519 */
4520unsigned dev_get_flags(const struct net_device *dev)
4521{
4522        unsigned flags;
4523
4524        flags = (dev->flags & ~(IFF_PROMISC |
4525                                IFF_ALLMULTI |
4526                                IFF_RUNNING |
4527                                IFF_LOWER_UP |
4528                                IFF_DORMANT)) |
4529                (dev->gflags & (IFF_PROMISC |
4530                                IFF_ALLMULTI));
4531
4532        if (netif_running(dev)) {
4533                if (netif_oper_up(dev))
4534                        flags |= IFF_RUNNING;
4535                if (netif_carrier_ok(dev))
4536                        flags |= IFF_LOWER_UP;
4537                if (netif_dormant(dev))
4538                        flags |= IFF_DORMANT;
4539        }
4540
4541        return flags;
4542}
4543EXPORT_SYMBOL(dev_get_flags);
4544
4545int __dev_change_flags(struct net_device *dev, unsigned int flags)
4546{
4547        int old_flags = dev->flags;
4548        int ret;
4549
4550        ASSERT_RTNL();
4551
4552        /*
4553         *      Set the flags on our device.
4554         */
4555
4556        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4557                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4558                               IFF_AUTOMEDIA)) |
4559                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4560                                    IFF_ALLMULTI));
4561
4562        /*
4563         *      Load in the correct multicast list now that the flags have changed.
4564         */
4565
4566        if ((old_flags ^ flags) & IFF_MULTICAST)
4567                dev_change_rx_flags(dev, IFF_MULTICAST);
4568
4569        dev_set_rx_mode(dev);
4570
4571        /*
4572         *      Have we downed the interface? We handle IFF_UP ourselves
4573         *      according to user attempts to set it, rather than blindly
4574         *      setting it.
4575         */
4576
4577        ret = 0;
4578        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4579                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4580
4581                if (!ret)
4582                        dev_set_rx_mode(dev);
4583        }
4584
4585        if ((flags ^ dev->gflags) & IFF_PROMISC) {
4586                int inc = (flags & IFF_PROMISC) ? 1 : -1;
4587
4588                dev->gflags ^= IFF_PROMISC;
4589                dev_set_promiscuity(dev, inc);
4590        }
4591
4592        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4593           is important. Some (broken) drivers set IFF_PROMISC when
4594           IFF_ALLMULTI is requested, without asking us and without reporting it.
4595         */
4596        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4597                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4598
4599                dev->gflags ^= IFF_ALLMULTI;
4600                dev_set_allmulti(dev, inc);
4601        }
4602
4603        return ret;
4604}
4605
4606void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4607{
4608        unsigned int changes = dev->flags ^ old_flags;
4609
4610        if (changes & IFF_UP) {
4611                if (dev->flags & IFF_UP)
4612                        call_netdevice_notifiers(NETDEV_UP, dev);
4613                else
4614                        call_netdevice_notifiers(NETDEV_DOWN, dev);
4615        }
4616
4617        if (dev->flags & IFF_UP &&
4618            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4619                call_netdevice_notifiers(NETDEV_CHANGE, dev);
4620}
4621
4622/**
4623 *      dev_change_flags - change device settings
4624 *      @dev: device
4625 *      @flags: device state flags
4626 *
4627 *      Change settings on device based state flags. The flags are
4628 *      in the userspace exported format.
4629 */
4630int dev_change_flags(struct net_device *dev, unsigned flags)
4631{
4632        int ret, changes;
4633        int old_flags = dev->flags;
4634
4635        ret = __dev_change_flags(dev, flags);
4636        if (ret < 0)
4637                return ret;
4638
4639        changes = old_flags ^ dev->flags;
4640        if (changes)
4641                rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4642
4643        __dev_notify_flags(dev, old_flags);
4644        return ret;
4645}
4646EXPORT_SYMBOL(dev_change_flags);
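/*
 * Illustrative sketch only (not part of dev.c, kept under #if 0): bringing
 * an interface up from kernel code with dev_change_flags(), mirroring what
 * the SIOCSIFFLAGS path below does.  my_if_up is a hypothetical helper;
 * __dev_get_by_name(), dev_get_flags() and dev_change_flags() are real.
 */
#if 0
static int my_if_up(struct net *net, const char *name)
{
        struct net_device *dev;
        int err;

        rtnl_lock();
        dev = __dev_get_by_name(net, name);     /* no refcount needed under RTNL */
        err = dev ? dev_change_flags(dev, dev_get_flags(dev) | IFF_UP) : -ENODEV;
        rtnl_unlock();
        return err;
}
#endif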
4647
4648/**
4649 *      dev_set_mtu - Change maximum transfer unit
4650 *      @dev: device
4651 *      @new_mtu: new transfer unit
4652 *
4653 *      Change the maximum transfer size of the network device.
4654 */
4655int dev_set_mtu(struct net_device *dev, int new_mtu)
4656{
4657        const struct net_device_ops *ops = dev->netdev_ops;
4658        int err;
4659
4660        if (new_mtu == dev->mtu)
4661                return 0;
4662
4663        /*      MTU must not be negative.    */
4664        if (new_mtu < 0)
4665                return -EINVAL;
4666
4667        if (!netif_device_present(dev))
4668                return -ENODEV;
4669
4670        err = 0;
4671        if (ops->ndo_change_mtu)
4672                err = ops->ndo_change_mtu(dev, new_mtu);
4673        else
4674                dev->mtu = new_mtu;
4675
4676        if (!err && dev->flags & IFF_UP)
4677                call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4678        return err;
4679}
4680EXPORT_SYMBOL(dev_set_mtu);
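/*
 * Illustrative sketch only (not part of dev.c, kept under #if 0): a stacked
 * device (VLAN/bonding style) propagating an MTU change via dev_set_mtu()
 * under RTNL.  my_adjust_mtu is a hypothetical helper name.
 */
#if 0
static int my_adjust_mtu(struct net_device *dev, int new_mtu)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, new_mtu);        /* fires NETDEV_CHANGEMTU if the device is up */
        rtnl_unlock();
        return err;
}
#endif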
4681
4682/**
4683 *      dev_set_group - Change group this device belongs to
4684 *      @dev: device
4685 *      @new_group: group this device should belong to
4686 */
4687void dev_set_group(struct net_device *dev, int new_group)
4688{
4689        dev->group = new_group;
4690}
4691EXPORT_SYMBOL(dev_set_group);
4692
4693/**
4694 *      dev_set_mac_address - Change Media Access Control Address
4695 *      @dev: device
4696 *      @sa: new address
4697 *
4698 *      Change the hardware (MAC) address of the device
4699 */
4700int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4701{
4702        const struct net_device_ops *ops = dev->netdev_ops;
4703        int err;
4704
4705        if (!ops->ndo_set_mac_address)
4706                return -EOPNOTSUPP;
4707        if (sa->sa_family != dev->type)
4708                return -EINVAL;
4709        if (!netif_device_present(dev))
4710                return -ENODEV;
4711        err = ops->ndo_set_mac_address(dev, sa);
4712        if (!err)
4713                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4714        return err;
4715}
4716EXPORT_SYMBOL(dev_set_mac_address);
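/*
 * Illustrative sketch only (not part of dev.c, kept under #if 0): setting an
 * Ethernet device's MAC from kernel code.  The sockaddr convention
 * (sa_family == dev->type, address bytes in sa_data) matches the checks in
 * dev_set_mac_address() above; my_set_mac is a hypothetical helper.
 */
#if 0
static int my_set_mac(struct net_device *dev, const u8 *mac)
{
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;               /* ARPHRD_ETHER for Ethernet */
        memcpy(sa.sa_data, mac, ETH_ALEN);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();
        return err;
}
#endif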
4717
4718/*
4719 *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4720 */
4721static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4722{
4723        int err;
4724        struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4725
4726        if (!dev)
4727                return -ENODEV;
4728
4729        switch (cmd) {
4730        case SIOCGIFFLAGS:      /* Get interface flags */
4731                ifr->ifr_flags = (short) dev_get_flags(dev);
4732                return 0;
4733
4734        case SIOCGIFMETRIC:     /* Get the metric on the interface
4735                                   (currently unused) */
4736                ifr->ifr_metric = 0;
4737                return 0;
4738
4739        case SIOCGIFMTU:        /* Get the MTU of a device */
4740                ifr->ifr_mtu = dev->mtu;
4741                return 0;
4742
4743        case SIOCGIFHWADDR:
4744                if (!dev->addr_len)
4745                        memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4746                else
4747                        memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4748                               min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4749                ifr->ifr_hwaddr.sa_family = dev->type;
4750                return 0;
4751
4752        case SIOCGIFSLAVE:
4753                err = -EINVAL;
4754                break;
4755
4756        case SIOCGIFMAP:
4757                ifr->ifr_map.mem_start = dev->mem_start;
4758                ifr->ifr_map.mem_end   = dev->mem_end;
4759                ifr->ifr_map.base_addr = dev->base_addr;
4760                ifr->ifr_map.irq       = dev->irq;
4761                ifr->ifr_map.dma       = dev->dma;
4762                ifr->ifr_map.port      = dev->if_port;
4763                return 0;
4764
4765        case SIOCGIFINDEX:
4766                ifr->ifr_ifindex = dev->ifindex;
4767                return 0;
4768
4769        case SIOCGIFTXQLEN:
4770                ifr->ifr_qlen = dev->tx_queue_len;
4771                return 0;
4772
4773        default:
4774                /* dev_ioctl() should ensure this case
4775                 * is never reached
4776                 */
4777                WARN_ON(1);
4778                err = -ENOTTY;
4779                break;
4780
4781        }
4782        return err;
4783}
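
/*
 * Illustrative user-space sketch, not part of this file: a read-only
 * request such as SIOCGIFMTU travels through dev_ioctl() below and lands
 * in dev_ifsioc_locked() above.  The interface name "eth0" is only an
 * example.
 *
 *      #include <net/if.h>
 *      #include <stdio.h>
 *      #include <string.h>
 *      #include <sys/ioctl.h>
 *      #include <sys/socket.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              struct ifreq ifr;
 *              int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *              memset(&ifr, 0, sizeof(ifr));
 *              strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *              if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *                      printf("%s mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);
 *              close(fd);
 *              return 0;
 *      }
 */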
4784
4785/*
4786 *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4787 */
4788static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4789{
4790        int err;
4791        struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4792        const struct net_device_ops *ops;
4793
4794        if (!dev)
4795                return -ENODEV;
4796
4797        ops = dev->netdev_ops;
4798
4799        switch (cmd) {
4800        case SIOCSIFFLAGS:      /* Set interface flags */
4801                return dev_change_flags(dev, ifr->ifr_flags);
4802
4803        case SIOCSIFMETRIC:     /* Set the metric on the interface
4804                                   (currently unused) */
4805                return -EOPNOTSUPP;
4806
4807        case SIOCSIFMTU:        /* Set the MTU of a device */
4808                return dev_set_mtu(dev, ifr->ifr_mtu);
4809
4810        case SIOCSIFHWADDR:
4811                return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4812
4813        case SIOCSIFHWBROADCAST:
4814                if (ifr->ifr_hwaddr.sa_family != dev->type)
4815                        return -EINVAL;
4816                memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4817                       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4818                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4819                return 0;
4820
4821        case SIOCSIFMAP:
4822                if (ops->ndo_set_config) {
4823                        if (!netif_device_present(dev))
4824                                return -ENODEV;
4825                        return ops->ndo_set_config(dev, &ifr->ifr_map);
4826                }
4827                return -EOPNOTSUPP;
4828
4829        case SIOCADDMULTI:
4830                if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4831                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4832                        return -EINVAL;
4833                if (!netif_device_present(dev))
4834                        return -ENODEV;
4835                return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4836
4837        case SIOCDELMULTI:
4838                if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4839                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4840                        return -EINVAL;
4841                if (!netif_device_present(dev))
4842                        return -ENODEV;
4843                return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4844
4845        case SIOCSIFTXQLEN:
4846                if (ifr->ifr_qlen < 0)
4847                        return -EINVAL;
4848                dev->tx_queue_len = ifr->ifr_qlen;
4849                return 0;
4850
4851        case SIOCSIFNAME:
4852                ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4853                return dev_change_name(dev, ifr->ifr_newname);
4854
4855        /*
4856         *      Unknown or private ioctl
4857         */
4858        default:
4859                if ((cmd >= SIOCDEVPRIVATE &&
4860                    cmd <= SIOCDEVPRIVATE + 15) ||
4861                    cmd == SIOCBONDENSLAVE ||
4862                    cmd == SIOCBONDRELEASE ||
4863                    cmd == SIOCBONDSETHWADDR ||
4864                    cmd == SIOCBONDSLAVEINFOQUERY ||
4865                    cmd == SIOCBONDINFOQUERY ||
4866                    cmd == SIOCBONDCHANGEACTIVE ||
4867                    cmd == SIOCGMIIPHY ||
4868                    cmd == SIOCGMIIREG ||
4869                    cmd == SIOCSMIIREG ||
4870                    cmd == SIOCBRADDIF ||
4871                    cmd == SIOCBRDELIF ||
4872                    cmd == SIOCSHWTSTAMP ||
4873                    cmd == SIOCWANDEV) {
4874                        err = -EOPNOTSUPP;
4875                        if (ops->ndo_do_ioctl) {
4876                                if (netif_device_present(dev))
4877                                        err = ops->ndo_do_ioctl(dev, ifr, cmd);
4878                                else
4879                                        err = -ENODEV;
4880                        }
4881                } else
4882                        err = -EINVAL;
4883
4884        }
4885        return err;
4886}
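
/*
 * Illustrative user-space sketch, not part of this file: a privileged
 * (CAP_NET_ADMIN) SIOCSIFMTU request is dispatched to dev_ifsioc() above
 * under rtnl_lock and ends up in dev_set_mtu().  "eth0" and 1400 are
 * example values; headers are as in the previous sketch.
 *
 *      struct ifreq ifr;
 *      int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *      memset(&ifr, 0, sizeof(ifr));
 *      strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *      ifr.ifr_mtu = 1400;
 *      if (ioctl(fd, SIOCSIFMTU, &ifr) < 0)
 *              perror("SIOCSIFMTU");
 */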
4887
4888/*
4889 *      This function handles all "interface"-type I/O control requests. The actual
4890 *      'doing' part of this is dev_ifsioc above.
4891 */
4892
4893/**
4894 *      dev_ioctl       -       network device ioctl
4895 *      @net: the applicable net namespace
4896 *      @cmd: command to issue
4897 *      @arg: pointer to a struct ifreq in user space
4898 *
4899 *      Issue ioctl functions to devices. This is normally called by the
4900 *      user space syscall interfaces but can sometimes be useful for
4901 *      other purposes. The return value is the value returned to the syscall
4902 *      if positive, or a negative errno code on error.
4903 */
4904
4905int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4906{
4907        struct ifreq ifr;
4908        int ret;
4909        char *colon;
4910
4911        /* One special case: SIOCGIFCONF takes an ifconf argument
4912           and is handled under rtnl_lock, because it sleeps while
4913           writing to user space.
4914         */
4915
4916        if (cmd == SIOCGIFCONF) {
4917                rtnl_lock();
4918                ret = dev_ifconf(net, (char __user *) arg);
4919                rtnl_unlock();
4920                return ret;
4921        }
4922        if (cmd == SIOCGIFNAME)
4923                return dev_ifname(net, (struct ifreq __user *)arg);
4924
4925        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4926                return -EFAULT;
4927
4928        ifr.ifr_name[IFNAMSIZ-1] = 0;
4929
4930        colon = strchr(ifr.ifr_name, ':');
4931        if (colon)
4932                *colon = 0;
4933
4934        /*
4935         *      See which interface the caller is talking about.
4936         */
4937
4938        switch (cmd) {
4939        /*
4940         *      These ioctl calls:
4941         *      - can be done by all.
4942         *      - are atomic and do not require the rtnl lock.
4943         *      - return a value
4944         */
4945        case SIOCGIFFLAGS:
4946        case SIOCGIFMETRIC:
4947        case SIOCGIFMTU:
4948        case SIOCGIFHWADDR:
4949        case SIOCGIFSLAVE:
4950        case SIOCGIFMAP:
4951        case SIOCGIFINDEX:
4952        case SIOCGIFTXQLEN:
4953                dev_load(net, ifr.ifr_name);
4954                rcu_read_lock();
4955                ret = dev_ifsioc_locked(net, &ifr, cmd);
4956                rcu_read_unlock();
4957                if (!ret) {
4958                        if (colon)
4959                                *colon = ':';
4960                        if (copy_to_user(arg, &ifr,
4961                                         sizeof(struct ifreq)))
4962                                ret = -EFAULT;
4963                }
4964                return ret;
4965
4966        case SIOCETHTOOL:
4967                dev_load(net, ifr.ifr_name);
4968                rtnl_lock();
4969                ret = dev_ethtool(net, &ifr);
4970                rtnl_unlock();
4971                if (!ret) {
4972                        if (colon)
4973                                *colon = ':';
4974                        if (copy_to_user(arg, &ifr,
4975                                         sizeof(struct ifreq)))
4976                                ret = -EFAULT;
4977                }
4978                return ret;
4979
4980        /*
4981         *      These ioctl calls:
4982         *      - require superuser power.
4983         *      - require strict serialization.
4984         *      - return a value
4985         */
4986        case SIOCGMIIPHY:
4987        case SIOCGMIIREG:
4988        case SIOCSIFNAME:
4989                if (!capable(CAP_NET_ADMIN))
4990                        return -EPERM;
4991                dev_load(net, ifr.ifr_name);
4992                rtnl_lock();
4993                ret = dev_ifsioc(net, &ifr, cmd);
4994                rtnl_unlock();
4995                if (!ret) {
4996                        if (colon)
4997                                *colon = ':';
4998                        if (copy_to_user(arg, &ifr,
4999                                         sizeof(struct ifreq)))
5000                                ret = -EFAULT;
5001                }
5002                return ret;
5003
5004        /*
5005         *      These ioctl calls:
5006         *      - require superuser power.
5007         *      - require strict serialization.
5008         *      - do not return a value
5009         */
5010        case SIOCSIFFLAGS:
5011        case SIOCSIFMETRIC:
5012        case SIOCSIFMTU:
5013        case SIOCSIFMAP:
5014        case SIOCSIFHWADDR:
5015        case SIOCSIFSLAVE:
5016        case SIOCADDMULTI:
5017        case SIOCDELMULTI:
5018        case SIOCSIFHWBROADCAST:
5019        case SIOCSIFTXQLEN:
5020        case SIOCSMIIREG:
5021        case SIOCBONDENSLAVE:
5022        case SIOCBONDRELEASE:
5023        case SIOCBONDSETHWADDR:
5024        case SIOCBONDCHANGEACTIVE:
5025        case SIOCBRADDIF:
5026        case SIOCBRDELIF:
5027        case SIOCSHWTSTAMP:
5028                if (!capable(CAP_NET_ADMIN))
5029                        return -EPERM;
5030                /* fall through */
5031        case SIOCBONDSLAVEINFOQUERY:
5032        case SIOCBONDINFOQUERY:
5033                dev_load(net, ifr.ifr_name);
5034                rtnl_lock();
5035                ret = dev_ifsioc(net, &ifr, cmd);
5036                rtnl_unlock();
5037                return ret;
5038
5039        case SIOCGIFMEM:
5040                /* Get the per device memory space. We can add this but
5041                 * currently do not support it */
5042        case SIOCSIFMEM:
5043                /* Set the per device memory buffer space.
5044                 * Not applicable in our case */
5045        case SIOCSIFLINK:
5046                return -ENOTTY;
5047
5048        /*
5049         *      Unknown or private ioctl.
5050         */
5051        default:
5052                if (cmd == SIOCWANDEV ||
5053                    (cmd >= SIOCDEVPRIVATE &&
5054                     cmd <= SIOCDEVPRIVATE + 15)) {
5055                        dev_load(net, ifr.ifr_name);
5056                        rtnl_lock();
5057                        ret = dev_ifsioc(net, &ifr, cmd);
5058                        rtnl_unlock();
5059                        if (!ret && copy_to_user(arg, &ifr,
5060                                                 sizeof(struct ifreq)))
5061                                ret = -EFAULT;
5062                        return ret;
5063                }
5064                /* Take care of Wireless Extensions */
5065                if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5066                        return wext_handle_ioctl(net, &ifr, cmd, arg);
5067                return -ENOTTY;
5068        }
5069}
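
/*
 * Illustrative user-space sketch, not part of this file: SIOCGIFCONF is the
 * special case handled before the main switch in dev_ioctl() above; it
 * fills a caller-supplied buffer with one struct ifreq per interface that
 * has a configured address.  The 4096-byte buffer size is arbitrary.
 *
 *      char buf[4096];
 *      struct ifconf ifc;
 *      struct ifreq *req;
 *      int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *      int i, n;
 *
 *      ifc.ifc_len = sizeof(buf);
 *      ifc.ifc_buf = buf;
 *      if (ioctl(fd, SIOCGIFCONF, &ifc) == 0) {
 *              req = ifc.ifc_req;
 *              n = ifc.ifc_len / sizeof(struct ifreq);
 *              for (i = 0; i < n; i++)
 *                      printf("%s\n", req[i].ifr_name);
 *      }
 */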
5070
5071
5072/**
5073 *      dev_new_index   -       allocate an ifindex
5074 *      @net: the applicable net namespace
5075 *
5076 *      Returns a suitable unique value for a new device interface
5077 *      number.  The caller must hold the rtnl semaphore or the
5078 *      dev_base_lock to be sure it remains unique.
5079 */
5080static int dev_new_index(struct net *net)
5081{
5082        static int ifindex;
5083        for (;;) {
5084                if (++ifindex <= 0)
5085                        ifindex = 1;
5086                if (!__dev_get_by_index(net, ifindex))
5087                        return ifindex;
5088        }
5089}
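
/*
 * Illustrative note, not part of the original file: registration code calls
 * this under RTNL, roughly as below, so the "remains unique" requirement in
 * the comment above is satisfied by the lock the caller already holds.
 *
 *      ASSERT_RTNL();
 *      dev->ifindex = dev_new_index(net);
 *
 * The static counter simply wraps back to 1 on overflow, and the loop skips
 * any index that is still in use.
 */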
5090
5091/* Delayed registration/unregistration */
5092static LIST_HEAD(net_todo_list);
5093
5094static void net_set_todo(struct net_device *dev)
5095{
5096        list_add_tail(&dev->todo_list, &net_todo_list);
5097}
5098
5099static void rollback_registered_many(struct list_head *head)
5100{
5101        struct net_device *dev, *tmp;
5102
5103        BUG_ON(dev_boot_phase);
5104        ASSERT_RTNL();
5105
5106        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5107                /* Some callers pass in devices that were never
5108                 * registered, as part of unwinding a failed init.
5109                 * Remove those and proceed with the remaining.
5110                 */
5111