linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <linux/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <linux/bpf.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <net/busy_poll.h>
 101#include <linux/rtnetlink.h>
 102#include <linux/stat.h>
 103#include <net/dst.h>
 104#include <net/dst_metadata.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/iw_handler.h>
 115#include <asm/current.h>
 116#include <linux/audit.h>
 117#include <linux/dmaengine.h>
 118#include <linux/err.h>
 119#include <linux/ctype.h>
 120#include <linux/if_arp.h>
 121#include <linux/if_vlan.h>
 122#include <linux/ip.h>
 123#include <net/ip.h>
 124#include <net/mpls.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/static_key.h>
 136#include <linux/hashtable.h>
 137#include <linux/vmalloc.h>
 138#include <linux/if_macvlan.h>
 139#include <linux/errqueue.h>
 140#include <linux/hrtimer.h>
 141#include <linux/netfilter_ingress.h>
 142#include <linux/crash_dump.h>
 143
 144#include "net-sysfs.h"
 145
 146/* Instead of increasing this, you should create a hash table. */
 147#define MAX_GRO_SKBS 8
 148
 149/* This should be increased if a protocol with a bigger head is added. */
 150#define GRO_MAX_HEAD (MAX_HEADER + 128)
 151
 152static DEFINE_SPINLOCK(ptype_lock);
 153static DEFINE_SPINLOCK(offload_lock);
 154struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 155struct list_head ptype_all __read_mostly;       /* Taps */
 156static struct list_head offload_base __read_mostly;
 157
 158static int netif_rx_internal(struct sk_buff *skb);
 159static int call_netdevice_notifiers_info(unsigned long val,
 160                                         struct net_device *dev,
 161                                         struct netdev_notifier_info *info);
 162
 163/*
 164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 165 * semaphore.
 166 *
 167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 168 *
 169 * Writers must hold the rtnl semaphore while they loop through the
 170 * dev_base_head list, and hold dev_base_lock for writing when they do the
 171 * actual updates.  This allows pure readers to access the list even
 172 * while a writer is preparing to update it.
 173 *
 174 * To put it another way, dev_base_lock is held for writing only to
 175 * protect against pure readers; the rtnl semaphore provides the
 176 * protection against other writers.
 177 *
 178 * See, for example usages, register_netdevice() and
 179 * unregister_netdevice(), which must be called with the rtnl
 180 * semaphore held.
 181 */
 182DEFINE_RWLOCK(dev_base_lock);
 183EXPORT_SYMBOL(dev_base_lock);
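
/*
 * Illustrative sketch (not part of the original file): a pure reader of
 * the device list can rely on RCU alone, as described above.  The helper
 * name below is hypothetical; rcu_read_lock() and for_each_netdev_rcu()
 * are the real primitives.
 *
 *	static int example_count_devices(struct net *net)
 *	{
 *		struct net_device *dev;
 *		int count = 0;
 *
 *		rcu_read_lock();
 *		for_each_netdev_rcu(net, dev)
 *			count++;
 *		rcu_read_unlock();
 *
 *		return count;
 *	}
 */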
 184
 185/* protects napi_hash addition/deletion and napi_gen_id */
 186static DEFINE_SPINLOCK(napi_hash_lock);
 187
 188static unsigned int napi_gen_id = NR_CPUS;
 189static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 190
 191static seqcount_t devnet_rename_seq;
 192
 193static inline void dev_base_seq_inc(struct net *net)
 194{
 195        while (++net->dev_base_seq == 0);
 196}
 197
 198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 199{
 200        unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 201
 202        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203}
 204
 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206{
 207        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208}
 209
 210static inline void rps_lock(struct softnet_data *sd)
 211{
 212#ifdef CONFIG_RPS
 213        spin_lock(&sd->input_pkt_queue.lock);
 214#endif
 215}
 216
 217static inline void rps_unlock(struct softnet_data *sd)
 218{
 219#ifdef CONFIG_RPS
 220        spin_unlock(&sd->input_pkt_queue.lock);
 221#endif
 222}
 223
 224/* Device list insertion */
 225static void list_netdevice(struct net_device *dev)
 226{
 227        struct net *net = dev_net(dev);
 228
 229        ASSERT_RTNL();
 230
 231        write_lock_bh(&dev_base_lock);
 232        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 233        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 234        hlist_add_head_rcu(&dev->index_hlist,
 235                           dev_index_hash(net, dev->ifindex));
 236        write_unlock_bh(&dev_base_lock);
 237
 238        dev_base_seq_inc(net);
 239}
 240
 241/* Device list removal
  242 * caller must respect an RCU grace period before freeing/reusing dev
 243 */
 244static void unlist_netdevice(struct net_device *dev)
 245{
 246        ASSERT_RTNL();
 247
 248        /* Unlink dev from the device chain */
 249        write_lock_bh(&dev_base_lock);
 250        list_del_rcu(&dev->dev_list);
 251        hlist_del_rcu(&dev->name_hlist);
 252        hlist_del_rcu(&dev->index_hlist);
 253        write_unlock_bh(&dev_base_lock);
 254
 255        dev_base_seq_inc(dev_net(dev));
 256}
 257
 258/*
 259 *      Our notifier list
 260 */
 261
 262static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264/*
 265 *      Device drivers call our routines to queue packets here. We empty the
 266 *      queue in the local softnet handler.
 267 */
 268
 269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272#ifdef CONFIG_LOCKDEP
 273/*
 274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275 * according to dev->type
 276 */
 277static const unsigned short netdev_lock_type[] =
 278        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 291         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 292         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 293
 294static const char *const netdev_lock_name[] =
 295        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 308         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 309         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 310
 311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313
 314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 315{
 316        int i;
 317
 318        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 319                if (netdev_lock_type[i] == dev_type)
 320                        return i;
 321        /* the last key is used by default */
 322        return ARRAY_SIZE(netdev_lock_type) - 1;
 323}
 324
 325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 326                                                 unsigned short dev_type)
 327{
 328        int i;
 329
 330        i = netdev_lock_pos(dev_type);
 331        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 332                                   netdev_lock_name[i]);
 333}
 334
 335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336{
 337        int i;
 338
 339        i = netdev_lock_pos(dev->type);
 340        lockdep_set_class_and_name(&dev->addr_list_lock,
 341                                   &netdev_addr_lock_key[i],
 342                                   netdev_lock_name[i]);
 343}
 344#else
 345static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 346                                                 unsigned short dev_type)
 347{
 348}
 349static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 350{
 351}
 352#endif
 353
 354/*******************************************************************************
 355
 356                Protocol management and registration routines
 357
 358*******************************************************************************/
 359
 360/*
 361 *      Add a protocol ID to the list. Now that the input handler is
 362 *      smarter we can dispense with all the messy stuff that used to be
 363 *      here.
 364 *
  365 *      BEWARE!!! Protocol handlers that mangle input packets
  366 *      MUST BE last in the hash buckets, and the check for protocol
  367 *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
  368 *      This holds today; do not change it.
  369 *      Explanation: if a packet-mangling protocol handler were first
  370 *      on the list, it could not tell that the packet is cloned and
  371 *      should be copied-on-write, so it would modify it in place and
  372 *      subsequent readers would see a broken packet.
  373 *                                                      --ANK (980803)
 374 */
 375
 376static inline struct list_head *ptype_head(const struct packet_type *pt)
 377{
 378        if (pt->type == htons(ETH_P_ALL))
 379                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 380        else
 381                return pt->dev ? &pt->dev->ptype_specific :
 382                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383}
 384
 385/**
 386 *      dev_add_pack - add packet handler
 387 *      @pt: packet type declaration
 388 *
 389 *      Add a protocol handler to the networking stack. The passed &packet_type
 390 *      is linked into kernel lists and may not be freed until it has been
 391 *      removed from the kernel lists.
 392 *
  393 *      This call does not sleep, therefore it cannot
  394 *      guarantee that all CPUs that are in the middle of receiving
  395 *      packets will see the new packet type (until the next received packet).
 396 */
 397
 398void dev_add_pack(struct packet_type *pt)
 399{
 400        struct list_head *head = ptype_head(pt);
 401
 402        spin_lock(&ptype_lock);
 403        list_add_rcu(&pt->list, head);
 404        spin_unlock(&ptype_lock);
 405}
 406EXPORT_SYMBOL(dev_add_pack);
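
/*
 * Illustrative sketch (not part of the original file): a module can hook
 * every received packet by registering an ETH_P_ALL tap as below and
 * removing it again with dev_remove_pack() on teardown.  The handler and
 * variable names are hypothetical.
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_pt __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_ALL),
 *		.func = example_rcv,
 *	};
 *
 *	dev_add_pack(&example_pt);
 *	...
 *	dev_remove_pack(&example_pt);
 */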
 407
 408/**
 409 *      __dev_remove_pack        - remove packet handler
 410 *      @pt: packet type declaration
 411 *
 412 *      Remove a protocol handler that was previously added to the kernel
 413 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414 *      from the kernel lists and can be freed or reused once this function
 415 *      returns.
 416 *
 417 *      The packet type might still be in use by receivers
  418 *      and must not be freed until after all CPUs have gone
 419 *      through a quiescent state.
 420 */
 421void __dev_remove_pack(struct packet_type *pt)
 422{
 423        struct list_head *head = ptype_head(pt);
 424        struct packet_type *pt1;
 425
 426        spin_lock(&ptype_lock);
 427
 428        list_for_each_entry(pt1, head, list) {
 429                if (pt == pt1) {
 430                        list_del_rcu(&pt->list);
 431                        goto out;
 432                }
 433        }
 434
 435        pr_warn("dev_remove_pack: %p not found\n", pt);
 436out:
 437        spin_unlock(&ptype_lock);
 438}
 439EXPORT_SYMBOL(__dev_remove_pack);
 440
 441/**
 442 *      dev_remove_pack  - remove packet handler
 443 *      @pt: packet type declaration
 444 *
 445 *      Remove a protocol handler that was previously added to the kernel
 446 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447 *      from the kernel lists and can be freed or reused once this function
 448 *      returns.
 449 *
 450 *      This call sleeps to guarantee that no CPU is looking at the packet
 451 *      type after return.
 452 */
 453void dev_remove_pack(struct packet_type *pt)
 454{
 455        __dev_remove_pack(pt);
 456
 457        synchronize_net();
 458}
 459EXPORT_SYMBOL(dev_remove_pack);
 460
 461
 462/**
 463 *      dev_add_offload - register offload handlers
 464 *      @po: protocol offload declaration
 465 *
 466 *      Add protocol offload handlers to the networking stack. The passed
 467 *      &proto_offload is linked into kernel lists and may not be freed until
 468 *      it has been removed from the kernel lists.
 469 *
  470 *      This call does not sleep, therefore it cannot
  471 *      guarantee that all CPUs that are in the middle of receiving
  472 *      packets will see the new offload handlers (until the next received packet).
 473 */
 474void dev_add_offload(struct packet_offload *po)
 475{
 476        struct packet_offload *elem;
 477
 478        spin_lock(&offload_lock);
 479        list_for_each_entry(elem, &offload_base, list) {
 480                if (po->priority < elem->priority)
 481                        break;
 482        }
 483        list_add_rcu(&po->list, elem->list.prev);
 484        spin_unlock(&offload_lock);
 485}
 486EXPORT_SYMBOL(dev_add_offload);
 487
 488/**
 489 *      __dev_remove_offload     - remove offload handler
 490 *      @po: packet offload declaration
 491 *
 492 *      Remove a protocol offload handler that was previously added to the
 493 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 494 *      is removed from the kernel lists and can be freed or reused once this
 495 *      function returns.
 496 *
 497 *      The packet type might still be in use by receivers
  498 *      and must not be freed until after all CPUs have gone
 499 *      through a quiescent state.
 500 */
 501static void __dev_remove_offload(struct packet_offload *po)
 502{
 503        struct list_head *head = &offload_base;
 504        struct packet_offload *po1;
 505
 506        spin_lock(&offload_lock);
 507
 508        list_for_each_entry(po1, head, list) {
 509                if (po == po1) {
 510                        list_del_rcu(&po->list);
 511                        goto out;
 512                }
 513        }
 514
 515        pr_warn("dev_remove_offload: %p not found\n", po);
 516out:
 517        spin_unlock(&offload_lock);
 518}
 519
 520/**
 521 *      dev_remove_offload       - remove packet offload handler
 522 *      @po: packet offload declaration
 523 *
 524 *      Remove a packet offload handler that was previously added to the kernel
 525 *      offload handlers by dev_add_offload(). The passed &offload_type is
 526 *      removed from the kernel lists and can be freed or reused once this
 527 *      function returns.
 528 *
 529 *      This call sleeps to guarantee that no CPU is looking at the packet
 530 *      type after return.
 531 */
 532void dev_remove_offload(struct packet_offload *po)
 533{
 534        __dev_remove_offload(po);
 535
 536        synchronize_net();
 537}
 538EXPORT_SYMBOL(dev_remove_offload);
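
/*
 * Illustrative sketch (not part of the original file): protocol offloads
 * are registered in the same way, e.g. how a protocol would wire up its
 * GSO/GRO callbacks.  The callback names below are hypothetical.
 *
 *	static struct packet_offload example_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.callbacks = {
 *			.gso_segment = example_gso_segment,
 *			.gro_receive = example_gro_receive,
 *			.gro_complete = example_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&example_offload);
 *	...
 *	dev_remove_offload(&example_offload);
 */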
 539
 540/******************************************************************************
 541
 542                      Device Boot-time Settings Routines
 543
 544*******************************************************************************/
 545
 546/* Boot time configuration table */
 547static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 548
 549/**
 550 *      netdev_boot_setup_add   - add new setup entry
 551 *      @name: name of the device
 552 *      @map: configured settings for the device
 553 *
  554 *      Adds a new setup entry to the dev_boot_setup list.  The function
  555 *      returns 0 on error and 1 on success.  This is a generic routine for
  556 *      all netdevices.
 557 */
 558static int netdev_boot_setup_add(char *name, struct ifmap *map)
 559{
 560        struct netdev_boot_setup *s;
 561        int i;
 562
 563        s = dev_boot_setup;
 564        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 566                        memset(s[i].name, 0, sizeof(s[i].name));
 567                        strlcpy(s[i].name, name, IFNAMSIZ);
 568                        memcpy(&s[i].map, map, sizeof(s[i].map));
 569                        break;
 570                }
 571        }
 572
 573        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 574}
 575
 576/**
 577 *      netdev_boot_setup_check - check boot time settings
 578 *      @dev: the netdevice
 579 *
  580 *      Check boot time settings for the device.
  581 *      Any settings found are applied to the device for use
  582 *      later during device probing.
  583 *      Returns 0 if no settings are found, 1 if they are.
 584 */
 585int netdev_boot_setup_check(struct net_device *dev)
 586{
 587        struct netdev_boot_setup *s = dev_boot_setup;
 588        int i;
 589
 590        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 591                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 592                    !strcmp(dev->name, s[i].name)) {
 593                        dev->irq        = s[i].map.irq;
 594                        dev->base_addr  = s[i].map.base_addr;
 595                        dev->mem_start  = s[i].map.mem_start;
 596                        dev->mem_end    = s[i].map.mem_end;
 597                        return 1;
 598                }
 599        }
 600        return 0;
 601}
 602EXPORT_SYMBOL(netdev_boot_setup_check);
 603
 604
 605/**
 606 *      netdev_boot_base        - get address from boot time settings
 607 *      @prefix: prefix for network device
 608 *      @unit: id for network device
 609 *
  610 *      Check boot time settings for the base address of the device.
  611 *      Any settings found are used for the device
  612 *      later during device probing.
  613 *      Returns 0 if no settings are found.
 614 */
 615unsigned long netdev_boot_base(const char *prefix, int unit)
 616{
 617        const struct netdev_boot_setup *s = dev_boot_setup;
 618        char name[IFNAMSIZ];
 619        int i;
 620
 621        sprintf(name, "%s%d", prefix, unit);
 622
 623        /*
 624         * If device already registered then return base of 1
 625         * to indicate not to probe for this interface
 626         */
 627        if (__dev_get_by_name(&init_net, name))
 628                return 1;
 629
 630        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 631                if (!strcmp(name, s[i].name))
 632                        return s[i].map.base_addr;
 633        return 0;
 634}
 635
 636/*
 637 * Saves at boot time configured settings for any netdevice.
 638 */
 639int __init netdev_boot_setup(char *str)
 640{
 641        int ints[5];
 642        struct ifmap map;
 643
 644        str = get_options(str, ARRAY_SIZE(ints), ints);
 645        if (!str || !*str)
 646                return 0;
 647
 648        /* Save settings */
 649        memset(&map, 0, sizeof(map));
 650        if (ints[0] > 0)
 651                map.irq = ints[1];
 652        if (ints[0] > 1)
 653                map.base_addr = ints[2];
 654        if (ints[0] > 2)
 655                map.mem_start = ints[3];
 656        if (ints[0] > 3)
 657                map.mem_end = ints[4];
 658
 659        /* Add new entry to the list */
 660        return netdev_boot_setup_add(str, &map);
 661}
 662
 663__setup("netdev=", netdev_boot_setup);
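
/*
 * Illustrative example (not part of the original file): given the parsing
 * above, a boot-time entry on the kernel command line takes the form
 * "netdev=irq,base_addr,mem_start,mem_end,name", e.g.
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * which records IRQ 9 and I/O base 0x300 for a device later registered
 * as "eth0"; netdev_boot_setup_check() then copies these values into the
 * net_device during probing.
 */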
 664
 665/*******************************************************************************
 666
 667                            Device Interface Subroutines
 668
 669*******************************************************************************/
 670
 671/**
  672 *      dev_get_iflink  - get 'iflink' value of an interface
 673 *      @dev: targeted interface
 674 *
 675 *      Indicates the ifindex the interface is linked to.
 676 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 677 */
 678
 679int dev_get_iflink(const struct net_device *dev)
 680{
 681        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 682                return dev->netdev_ops->ndo_get_iflink(dev);
 683
 684        return dev->ifindex;
 685}
 686EXPORT_SYMBOL(dev_get_iflink);
 687
 688/**
 689 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 690 *      @dev: targeted interface
 691 *      @skb: The packet.
 692 *
  693 *      For better visibility of tunnel traffic, OVS needs to retrieve
  694 *      egress tunnel information for a packet. The following API allows
  695 *      the user to get this info.
 696 */
 697int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 698{
 699        struct ip_tunnel_info *info;
 700
 701        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 702                return -EINVAL;
 703
 704        info = skb_tunnel_info_unclone(skb);
 705        if (!info)
 706                return -ENOMEM;
 707        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 708                return -EINVAL;
 709
 710        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 711}
 712EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 713
 714/**
 715 *      __dev_get_by_name       - find a device by its name
 716 *      @net: the applicable net namespace
 717 *      @name: name to find
 718 *
 719 *      Find an interface by name. Must be called under RTNL semaphore
 720 *      or @dev_base_lock. If the name is found a pointer to the device
 721 *      is returned. If the name is not found then %NULL is returned. The
 722 *      reference counters are not incremented so the caller must be
 723 *      careful with locks.
 724 */
 725
 726struct net_device *__dev_get_by_name(struct net *net, const char *name)
 727{
 728        struct net_device *dev;
 729        struct hlist_head *head = dev_name_hash(net, name);
 730
 731        hlist_for_each_entry(dev, head, name_hlist)
 732                if (!strncmp(dev->name, name, IFNAMSIZ))
 733                        return dev;
 734
 735        return NULL;
 736}
 737EXPORT_SYMBOL(__dev_get_by_name);
 738
 739/**
 740 *      dev_get_by_name_rcu     - find a device by its name
 741 *      @net: the applicable net namespace
 742 *      @name: name to find
 743 *
 744 *      Find an interface by name.
 745 *      If the name is found a pointer to the device is returned.
 746 *      If the name is not found then %NULL is returned.
 747 *      The reference counters are not incremented so the caller must be
 748 *      careful with locks. The caller must hold RCU lock.
 749 */
 750
 751struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 752{
 753        struct net_device *dev;
 754        struct hlist_head *head = dev_name_hash(net, name);
 755
 756        hlist_for_each_entry_rcu(dev, head, name_hlist)
 757                if (!strncmp(dev->name, name, IFNAMSIZ))
 758                        return dev;
 759
 760        return NULL;
 761}
 762EXPORT_SYMBOL(dev_get_by_name_rcu);
 763
 764/**
 765 *      dev_get_by_name         - find a device by its name
 766 *      @net: the applicable net namespace
 767 *      @name: name to find
 768 *
 769 *      Find an interface by name. This can be called from any
 770 *      context and does its own locking. The returned handle has
 771 *      the usage count incremented and the caller must use dev_put() to
 772 *      release it when it is no longer needed. %NULL is returned if no
 773 *      matching device is found.
 774 */
 775
 776struct net_device *dev_get_by_name(struct net *net, const char *name)
 777{
 778        struct net_device *dev;
 779
 780        rcu_read_lock();
 781        dev = dev_get_by_name_rcu(net, name);
 782        if (dev)
 783                dev_hold(dev);
 784        rcu_read_unlock();
 785        return dev;
 786}
 787EXPORT_SYMBOL(dev_get_by_name);
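
/*
 * Illustrative sketch (not part of the original file): the refcounted and
 * RCU lookup variants above are used as follows (the interface name is
 * made up).
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		... use dev; a reference is held ...
 *		dev_put(dev);
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		... use dev only inside this RCU read-side section ...
 *	rcu_read_unlock();
 */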
 788
 789/**
 790 *      __dev_get_by_index - find a device by its ifindex
 791 *      @net: the applicable net namespace
 792 *      @ifindex: index of device
 793 *
  794 *      Search for an interface by index. Returns a pointer to the device,
  795 *      or %NULL if it is not found. The device has not
 796 *      had its reference counter increased so the caller must be careful
 797 *      about locking. The caller must hold either the RTNL semaphore
 798 *      or @dev_base_lock.
 799 */
 800
 801struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 802{
 803        struct net_device *dev;
 804        struct hlist_head *head = dev_index_hash(net, ifindex);
 805
 806        hlist_for_each_entry(dev, head, index_hlist)
 807                if (dev->ifindex == ifindex)
 808                        return dev;
 809
 810        return NULL;
 811}
 812EXPORT_SYMBOL(__dev_get_by_index);
 813
 814/**
 815 *      dev_get_by_index_rcu - find a device by its ifindex
 816 *      @net: the applicable net namespace
 817 *      @ifindex: index of device
 818 *
  819 *      Search for an interface by index. Returns a pointer to the device,
  820 *      or %NULL if it is not found. The device has not
 821 *      had its reference counter increased so the caller must be careful
 822 *      about locking. The caller must hold RCU lock.
 823 */
 824
 825struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 826{
 827        struct net_device *dev;
 828        struct hlist_head *head = dev_index_hash(net, ifindex);
 829
 830        hlist_for_each_entry_rcu(dev, head, index_hlist)
 831                if (dev->ifindex == ifindex)
 832                        return dev;
 833
 834        return NULL;
 835}
 836EXPORT_SYMBOL(dev_get_by_index_rcu);
 837
 838
 839/**
 840 *      dev_get_by_index - find a device by its ifindex
 841 *      @net: the applicable net namespace
 842 *      @ifindex: index of device
 843 *
  844 *      Search for an interface by index. Returns a pointer to the device,
  845 *      or NULL if it is not found. The device returned has
 846 *      had a reference added and the pointer is safe until the user calls
 847 *      dev_put to indicate they have finished with it.
 848 */
 849
 850struct net_device *dev_get_by_index(struct net *net, int ifindex)
 851{
 852        struct net_device *dev;
 853
 854        rcu_read_lock();
 855        dev = dev_get_by_index_rcu(net, ifindex);
 856        if (dev)
 857                dev_hold(dev);
 858        rcu_read_unlock();
 859        return dev;
 860}
 861EXPORT_SYMBOL(dev_get_by_index);
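
/*
 * Illustrative sketch (not part of the original file): ifindex lookups
 * follow the same pattern as the name lookups above.
 *
 *	struct net_device *dev = dev_get_by_index(net, ifindex);
 *
 *	if (!dev)
 *		return -ENODEV;
 *	... use dev ...
 *	dev_put(dev);
 */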
 862
 863/**
 864 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 865 *      @net: network namespace
 866 *      @name: a pointer to the buffer where the name will be stored.
 867 *      @ifindex: the ifindex of the interface to get the name from.
 868 *
 869 *      The use of raw_seqcount_begin() and cond_resched() before
 870 *      retrying is required as we want to give the writers a chance
 871 *      to complete when CONFIG_PREEMPT is not set.
 872 */
 873int netdev_get_name(struct net *net, char *name, int ifindex)
 874{
 875        struct net_device *dev;
 876        unsigned int seq;
 877
 878retry:
 879        seq = raw_seqcount_begin(&devnet_rename_seq);
 880        rcu_read_lock();
 881        dev = dev_get_by_index_rcu(net, ifindex);
 882        if (!dev) {
 883                rcu_read_unlock();
 884                return -ENODEV;
 885        }
 886
 887        strcpy(name, dev->name);
 888        rcu_read_unlock();
 889        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 890                cond_resched();
 891                goto retry;
 892        }
 893
 894        return 0;
 895}
 896
 897/**
 898 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 899 *      @net: the applicable net namespace
 900 *      @type: media type of device
 901 *      @ha: hardware address
 902 *
  903 *      Search for an interface by MAC address. Returns a pointer to the
  904 *      device, or NULL if it is not found.
 905 *      The caller must hold RCU or RTNL.
 906 *      The returned device has not had its ref count increased
 907 *      and the caller must therefore be careful about locking
 908 *
 909 */
 910
 911struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 912                                       const char *ha)
 913{
 914        struct net_device *dev;
 915
 916        for_each_netdev_rcu(net, dev)
 917                if (dev->type == type &&
 918                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 919                        return dev;
 920
 921        return NULL;
 922}
 923EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
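
/*
 * Illustrative sketch (not part of the original file): looking up an
 * Ethernet device by MAC address.  The address bytes are made up.
 *
 *	static const char addr[ETH_ALEN] =
 *		{ 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, addr);
 *	if (dev)
 *		... use dev inside the RCU read-side section only ...
 *	rcu_read_unlock();
 */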
 924
 925struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 926{
 927        struct net_device *dev;
 928
 929        ASSERT_RTNL();
 930        for_each_netdev(net, dev)
 931                if (dev->type == type)
 932                        return dev;
 933
 934        return NULL;
 935}
 936EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 937
 938struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 939{
 940        struct net_device *dev, *ret = NULL;
 941
 942        rcu_read_lock();
 943        for_each_netdev_rcu(net, dev)
 944                if (dev->type == type) {
 945                        dev_hold(dev);
 946                        ret = dev;
 947                        break;
 948                }
 949        rcu_read_unlock();
 950        return ret;
 951}
 952EXPORT_SYMBOL(dev_getfirstbyhwtype);
 953
 954/**
 955 *      __dev_get_by_flags - find any device with given flags
 956 *      @net: the applicable net namespace
 957 *      @if_flags: IFF_* values
 958 *      @mask: bitmask of bits in if_flags to check
 959 *
  960 *      Search for any interface with the given flags. Returns a pointer to
  961 *      the first matching device, or NULL if none is found. Must be called inside
 962 *      rtnl_lock(), and result refcount is unchanged.
 963 */
 964
 965struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 966                                      unsigned short mask)
 967{
 968        struct net_device *dev, *ret;
 969
 970        ASSERT_RTNL();
 971
 972        ret = NULL;
 973        for_each_netdev(net, dev) {
 974                if (((dev->flags ^ if_flags) & mask) == 0) {
 975                        ret = dev;
 976                        break;
 977                }
 978        }
 979        return ret;
 980}
 981EXPORT_SYMBOL(__dev_get_by_flags);
 982
 983/**
 984 *      dev_valid_name - check if name is okay for network device
 985 *      @name: name string
 986 *
  987 *      Network device names need to be valid file names
  988 *      to allow sysfs to work.  We also disallow any kind of
 989 *      whitespace.
 990 */
 991bool dev_valid_name(const char *name)
 992{
 993        if (*name == '\0')
 994                return false;
 995        if (strlen(name) >= IFNAMSIZ)
 996                return false;
 997        if (!strcmp(name, ".") || !strcmp(name, ".."))
 998                return false;
 999
1000        while (*name) {
1001                if (*name == '/' || *name == ':' || isspace(*name))
1002                        return false;
1003                name++;
1004        }
1005        return true;
1006}
1007EXPORT_SYMBOL(dev_valid_name);
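
/*
 * Illustrative examples (not part of the original file): per the checks
 * above, "eth0" and "tap%d" pass dev_valid_name(), while "", ".", "..",
 * "a/b", "a:b", "has space" and any name of IFNAMSIZ or more characters
 * are rejected.  '%' itself is not rejected here; format strings are
 * resolved later by dev_alloc_name().
 */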
1008
1009/**
1010 *      __dev_alloc_name - allocate a name for a device
1011 *      @net: network namespace to allocate the device name in
1012 *      @name: name format string
1013 *      @buf:  scratch buffer and result name string
1014 *
 1015 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 1016 *      id. It scans the list of devices to build up a free map, then chooses
1017 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1018 *      while allocating the name and adding the device in order to avoid
1019 *      duplicates.
1020 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 *      Returns the number of the unit assigned or a negative errno code.
1022 */
1023
1024static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025{
1026        int i = 0;
1027        const char *p;
1028        const int max_netdevices = 8*PAGE_SIZE;
1029        unsigned long *inuse;
1030        struct net_device *d;
1031
1032        p = strnchr(name, IFNAMSIZ-1, '%');
1033        if (p) {
1034                /*
1035                 * Verify the string as this thing may have come from
1036                 * the user.  There must be either one "%d" and no other "%"
1037                 * characters.
1038                 */
1039                if (p[1] != 'd' || strchr(p + 2, '%'))
1040                        return -EINVAL;
1041
1042                /* Use one page as a bit array of possible slots */
1043                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044                if (!inuse)
1045                        return -ENOMEM;
1046
1047                for_each_netdev(net, d) {
1048                        if (!sscanf(d->name, name, &i))
1049                                continue;
1050                        if (i < 0 || i >= max_netdevices)
1051                                continue;
1052
1053                        /*  avoid cases where sscanf is not exact inverse of printf */
1054                        snprintf(buf, IFNAMSIZ, name, i);
1055                        if (!strncmp(buf, d->name, IFNAMSIZ))
1056                                set_bit(i, inuse);
1057                }
1058
1059                i = find_first_zero_bit(inuse, max_netdevices);
1060                free_page((unsigned long) inuse);
1061        }
1062
1063        if (buf != name)
1064                snprintf(buf, IFNAMSIZ, name, i);
1065        if (!__dev_get_by_name(net, buf))
1066                return i;
1067
1068        /* It is possible to run out of possible slots
1069         * when the name is long and there isn't enough space left
1070         * for the digits, or if all bits are used.
1071         */
1072        return -ENFILE;
1073}
1074
1075/**
1076 *      dev_alloc_name - allocate a name for a device
1077 *      @dev: device
1078 *      @name: name format string
1079 *
 1080 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 1081 *      id. It scans the list of devices to build up a free map, then chooses
1082 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1083 *      while allocating the name and adding the device in order to avoid
1084 *      duplicates.
1085 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 *      Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089int dev_alloc_name(struct net_device *dev, const char *name)
1090{
1091        char buf[IFNAMSIZ];
1092        struct net *net;
1093        int ret;
1094
1095        BUG_ON(!dev_net(dev));
1096        net = dev_net(dev);
1097        ret = __dev_alloc_name(net, name, buf);
1098        if (ret >= 0)
1099                strlcpy(dev->name, buf, IFNAMSIZ);
1100        return ret;
1101}
1102EXPORT_SYMBOL(dev_alloc_name);
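
/*
 * Illustrative sketch (not part of the original file): a driver that
 * wants an automatically numbered name passes a format string, e.g.
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto fail;
 *
 * On success dev->name holds the first free name ("dummy0", "dummy1",
 * ...) and err is the unit number that was assigned.
 */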
1103
1104static int dev_alloc_name_ns(struct net *net,
1105                             struct net_device *dev,
1106                             const char *name)
1107{
1108        char buf[IFNAMSIZ];
1109        int ret;
1110
1111        ret = __dev_alloc_name(net, name, buf);
1112        if (ret >= 0)
1113                strlcpy(dev->name, buf, IFNAMSIZ);
1114        return ret;
1115}
1116
1117static int dev_get_valid_name(struct net *net,
1118                              struct net_device *dev,
1119                              const char *name)
1120{
1121        BUG_ON(!net);
1122
1123        if (!dev_valid_name(name))
1124                return -EINVAL;
1125
1126        if (strchr(name, '%'))
1127                return dev_alloc_name_ns(net, dev, name);
1128        else if (__dev_get_by_name(net, name))
1129                return -EEXIST;
1130        else if (dev->name != name)
1131                strlcpy(dev->name, name, IFNAMSIZ);
1132
1133        return 0;
1134}
1135
1136/**
1137 *      dev_change_name - change name of a device
1138 *      @dev: device
1139 *      @newname: name (or format string) must be at least IFNAMSIZ
1140 *
 1141 *      Change the name of a device. A format string such as "eth%d"
 1142 *      can be passed for wildcarding.
1143 */
1144int dev_change_name(struct net_device *dev, const char *newname)
1145{
1146        unsigned char old_assign_type;
1147        char oldname[IFNAMSIZ];
1148        int err = 0;
1149        int ret;
1150        struct net *net;
1151
1152        ASSERT_RTNL();
1153        BUG_ON(!dev_net(dev));
1154
1155        net = dev_net(dev);
1156        if (dev->flags & IFF_UP)
1157                return -EBUSY;
1158
1159        write_seqcount_begin(&devnet_rename_seq);
1160
1161        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162                write_seqcount_end(&devnet_rename_seq);
1163                return 0;
1164        }
1165
1166        memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168        err = dev_get_valid_name(net, dev, newname);
1169        if (err < 0) {
1170                write_seqcount_end(&devnet_rename_seq);
1171                return err;
1172        }
1173
1174        if (oldname[0] && !strchr(oldname, '%'))
1175                netdev_info(dev, "renamed from %s\n", oldname);
1176
1177        old_assign_type = dev->name_assign_type;
1178        dev->name_assign_type = NET_NAME_RENAMED;
1179
1180rollback:
1181        ret = device_rename(&dev->dev, dev->name);
1182        if (ret) {
1183                memcpy(dev->name, oldname, IFNAMSIZ);
1184                dev->name_assign_type = old_assign_type;
1185                write_seqcount_end(&devnet_rename_seq);
1186                return ret;
1187        }
1188
1189        write_seqcount_end(&devnet_rename_seq);
1190
1191        netdev_adjacent_rename_links(dev, oldname);
1192
1193        write_lock_bh(&dev_base_lock);
1194        hlist_del_rcu(&dev->name_hlist);
1195        write_unlock_bh(&dev_base_lock);
1196
1197        synchronize_rcu();
1198
1199        write_lock_bh(&dev_base_lock);
1200        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201        write_unlock_bh(&dev_base_lock);
1202
1203        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204        ret = notifier_to_errno(ret);
1205
1206        if (ret) {
1207                /* err >= 0 after dev_alloc_name() or stores the first errno */
1208                if (err >= 0) {
1209                        err = ret;
1210                        write_seqcount_begin(&devnet_rename_seq);
1211                        memcpy(dev->name, oldname, IFNAMSIZ);
1212                        memcpy(oldname, newname, IFNAMSIZ);
1213                        dev->name_assign_type = old_assign_type;
1214                        old_assign_type = NET_NAME_RENAMED;
1215                        goto rollback;
1216                } else {
1217                        pr_err("%s: name change rollback failed: %d\n",
1218                               dev->name, ret);
1219                }
1220        }
1221
1222        return err;
1223}
1224
1225/**
1226 *      dev_set_alias - change ifalias of a device
1227 *      @dev: device
1228 *      @alias: name up to IFALIASZ
1229 *      @len: limit of bytes to copy from info
1230 *
 1231 *      Set the ifalias for a device.
1232 */
1233int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234{
1235        char *new_ifalias;
1236
1237        ASSERT_RTNL();
1238
1239        if (len >= IFALIASZ)
1240                return -EINVAL;
1241
1242        if (!len) {
1243                kfree(dev->ifalias);
1244                dev->ifalias = NULL;
1245                return 0;
1246        }
1247
1248        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249        if (!new_ifalias)
1250                return -ENOMEM;
1251        dev->ifalias = new_ifalias;
1252
1253        strlcpy(dev->ifalias, alias, len+1);
1254        return len;
1255}
1256
1257
1258/**
1259 *      netdev_features_change - device changes features
1260 *      @dev: device to cause notification
1261 *
1262 *      Called to indicate a device has changed features.
1263 */
1264void netdev_features_change(struct net_device *dev)
1265{
1266        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267}
1268EXPORT_SYMBOL(netdev_features_change);
1269
1270/**
1271 *      netdev_state_change - device changes state
1272 *      @dev: device to cause notification
1273 *
1274 *      Called to indicate a device has changed state. This function calls
1275 *      the notifier chains for netdev_chain and sends a NEWLINK message
1276 *      to the routing socket.
1277 */
1278void netdev_state_change(struct net_device *dev)
1279{
1280        if (dev->flags & IFF_UP) {
1281                struct netdev_notifier_change_info change_info;
1282
1283                change_info.flags_changed = 0;
1284                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285                                              &change_info.info);
1286                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287        }
1288}
1289EXPORT_SYMBOL(netdev_state_change);
1290
1291/**
1292 *      netdev_notify_peers - notify network peers about existence of @dev
1293 *      @dev: network device
1294 *
1295 * Generate traffic such that interested network peers are aware of
1296 * @dev, such as by generating a gratuitous ARP. This may be used when
1297 * a device wants to inform the rest of the network about some sort of
1298 * reconfiguration such as a failover event or virtual machine
1299 * migration.
1300 */
1301void netdev_notify_peers(struct net_device *dev)
1302{
1303        rtnl_lock();
1304        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305        rtnl_unlock();
1306}
1307EXPORT_SYMBOL(netdev_notify_peers);
1308
1309static int __dev_open(struct net_device *dev)
1310{
1311        const struct net_device_ops *ops = dev->netdev_ops;
1312        int ret;
1313
1314        ASSERT_RTNL();
1315
1316        if (!netif_device_present(dev))
1317                return -ENODEV;
1318
1319        /* Block netpoll from trying to do any rx path servicing.
1320         * If we don't do this there is a chance ndo_poll_controller
1321         * or ndo_poll may be running while we open the device
1322         */
1323        netpoll_poll_disable(dev);
1324
1325        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326        ret = notifier_to_errno(ret);
1327        if (ret)
1328                return ret;
1329
1330        set_bit(__LINK_STATE_START, &dev->state);
1331
1332        if (ops->ndo_validate_addr)
1333                ret = ops->ndo_validate_addr(dev);
1334
1335        if (!ret && ops->ndo_open)
1336                ret = ops->ndo_open(dev);
1337
1338        netpoll_poll_enable(dev);
1339
1340        if (ret)
1341                clear_bit(__LINK_STATE_START, &dev->state);
1342        else {
1343                dev->flags |= IFF_UP;
1344                dev_set_rx_mode(dev);
1345                dev_activate(dev);
1346                add_device_randomness(dev->dev_addr, dev->addr_len);
1347        }
1348
1349        return ret;
1350}
1351
1352/**
1353 *      dev_open        - prepare an interface for use.
1354 *      @dev:   device to open
1355 *
1356 *      Takes a device from down to up state. The device's private open
1357 *      function is invoked and then the multicast lists are loaded. Finally
1358 *      the device is moved into the up state and a %NETDEV_UP message is
1359 *      sent to the netdev notifier chain.
1360 *
1361 *      Calling this function on an active interface is a nop. On a failure
1362 *      a negative errno code is returned.
1363 */
1364int dev_open(struct net_device *dev)
1365{
1366        int ret;
1367
1368        if (dev->flags & IFF_UP)
1369                return 0;
1370
1371        ret = __dev_open(dev);
1372        if (ret < 0)
1373                return ret;
1374
1375        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376        call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378        return ret;
1379}
1380EXPORT_SYMBOL(dev_open);
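
/*
 * Illustrative sketch (not part of the original file): __dev_open()
 * asserts the RTNL semaphore, so callers bring an interface up under
 * rtnl_lock().  The interface name is made up.
 *
 *	int err = -ENODEV;
 *	struct net_device *dev;
 *
 *	rtnl_lock();
 *	dev = __dev_get_by_name(net, "eth0");
 *	if (dev)
 *		err = dev_open(dev);
 *	rtnl_unlock();
 */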
1381
1382static int __dev_close_many(struct list_head *head)
1383{
1384        struct net_device *dev;
1385
1386        ASSERT_RTNL();
1387        might_sleep();
1388
1389        list_for_each_entry(dev, head, close_list) {
1390                /* Temporarily disable netpoll until the interface is down */
1391                netpoll_poll_disable(dev);
1392
1393                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395                clear_bit(__LINK_STATE_START, &dev->state);
1396
 1397                /* Synchronize to scheduled poll. We cannot touch the poll list; it
 1398                 * may even be on a different cpu. So just clear netif_running().
 1399                 *
 1400                 * dev->stop() will invoke napi_disable() on all of its
 1401                 * napi_struct instances on this device.
1402                 */
1403                smp_mb__after_atomic(); /* Commit netif_running(). */
1404        }
1405
1406        dev_deactivate_many(head);
1407
1408        list_for_each_entry(dev, head, close_list) {
1409                const struct net_device_ops *ops = dev->netdev_ops;
1410
1411                /*
 1412                 *      Call the device-specific close. This cannot fail and
 1413                 *      is only done if the device is UP.
1414                 *
1415                 *      We allow it to be called even after a DETACH hot-plug
1416                 *      event.
1417                 */
1418                if (ops->ndo_stop)
1419                        ops->ndo_stop(dev);
1420
1421                dev->flags &= ~IFF_UP;
1422                netpoll_poll_enable(dev);
1423        }
1424
1425        return 0;
1426}
1427
1428static int __dev_close(struct net_device *dev)
1429{
1430        int retval;
1431        LIST_HEAD(single);
1432
1433        list_add(&dev->close_list, &single);
1434        retval = __dev_close_many(&single);
1435        list_del(&single);
1436
1437        return retval;
1438}
1439
1440int dev_close_many(struct list_head *head, bool unlink)
1441{
1442        struct net_device *dev, *tmp;
1443
1444        /* Remove the devices that don't need to be closed */
1445        list_for_each_entry_safe(dev, tmp, head, close_list)
1446                if (!(dev->flags & IFF_UP))
1447                        list_del_init(&dev->close_list);
1448
1449        __dev_close_many(head);
1450
1451        list_for_each_entry_safe(dev, tmp, head, close_list) {
1452                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453                call_netdevice_notifiers(NETDEV_DOWN, dev);
1454                if (unlink)
1455                        list_del_init(&dev->close_list);
1456        }
1457
1458        return 0;
1459}
1460EXPORT_SYMBOL(dev_close_many);
1461
1462/**
1463 *      dev_close - shutdown an interface.
1464 *      @dev: device to shutdown
1465 *
1466 *      This function moves an active device into down state. A
1467 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 *      chain.
1470 */
1471int dev_close(struct net_device *dev)
1472{
1473        if (dev->flags & IFF_UP) {
1474                LIST_HEAD(single);
1475
1476                list_add(&dev->close_list, &single);
1477                dev_close_many(&single, true);
1478                list_del(&single);
1479        }
1480        return 0;
1481}
1482EXPORT_SYMBOL(dev_close);
1483
1484
1485/**
1486 *      dev_disable_lro - disable Large Receive Offload on a device
1487 *      @dev: device
1488 *
1489 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1490 *      called under RTNL.  This is needed if received packets may be
1491 *      forwarded to another interface.
1492 */
1493void dev_disable_lro(struct net_device *dev)
1494{
1495        struct net_device *lower_dev;
1496        struct list_head *iter;
1497
1498        dev->wanted_features &= ~NETIF_F_LRO;
1499        netdev_update_features(dev);
1500
1501        if (unlikely(dev->features & NETIF_F_LRO))
1502                netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504        netdev_for_each_lower_dev(dev, lower_dev, iter)
1505                dev_disable_lro(lower_dev);
1506}
1507EXPORT_SYMBOL(dev_disable_lro);
1508
1509static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510                                   struct net_device *dev)
1511{
1512        struct netdev_notifier_info info;
1513
1514        netdev_notifier_info_init(&info, dev);
1515        return nb->notifier_call(nb, val, &info);
1516}
1517
1518static int dev_boot_phase = 1;
1519
1520/**
1521 *      register_netdevice_notifier - register a network notifier block
1522 *      @nb: notifier
1523 *
1524 *      Register a notifier to be called when network device events occur.
1525 *      The notifier passed is linked into the kernel structures and must
1526 *      not be reused until it has been unregistered. A negative errno code
1527 *      is returned on a failure.
1528 *
 1529 *      When registered, all registration and up events are replayed
 1530 *      to the new notifier to allow the caller to have a race-free
 1531 *      view of the network device list.
1532 */
1533
1534int register_netdevice_notifier(struct notifier_block *nb)
1535{
1536        struct net_device *dev;
1537        struct net_device *last;
1538        struct net *net;
1539        int err;
1540
1541        rtnl_lock();
1542        err = raw_notifier_chain_register(&netdev_chain, nb);
1543        if (err)
1544                goto unlock;
1545        if (dev_boot_phase)
1546                goto unlock;
1547        for_each_net(net) {
1548                for_each_netdev(net, dev) {
1549                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550                        err = notifier_to_errno(err);
1551                        if (err)
1552                                goto rollback;
1553
1554                        if (!(dev->flags & IFF_UP))
1555                                continue;
1556
1557                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1558                }
1559        }
1560
1561unlock:
1562        rtnl_unlock();
1563        return err;
1564
1565rollback:
1566        last = dev;
1567        for_each_net(net) {
1568                for_each_netdev(net, dev) {
1569                        if (dev == last)
1570                                goto outroll;
1571
1572                        if (dev->flags & IFF_UP) {
1573                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574                                                        dev);
1575                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576                        }
1577                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578                }
1579        }
1580
1581outroll:
1582        raw_notifier_chain_unregister(&netdev_chain, nb);
1583        goto unlock;
1584}
1585EXPORT_SYMBOL(register_netdevice_notifier);
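
/* Illustrative usage sketch, not part of dev.c: the kind of minimal notifier
 * a hypothetical module might register.  Because registration replays
 * NETDEV_REGISTER and NETDEV_UP for devices that already exist, the callback
 * sees every device as if it had been watching from the start.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_UP:
                netdev_info(dev, "interface came up\n");
                break;
        case NETDEV_DOWN:
                netdev_info(dev, "interface went down\n");
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
        .notifier_call = example_netdev_event,
};
/* The module would call register_netdevice_notifier(&example_netdev_nb) at
 * init time and unregister_netdevice_notifier(&example_netdev_nb) at exit.
 */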
1586
1587/**
1588 *      unregister_netdevice_notifier - unregister a network notifier block
1589 *      @nb: notifier
1590 *
1591 *      Unregister a notifier previously registered by
1592 *      register_netdevice_notifier(). The notifier is unlinked from the
1593 *      kernel structures and may then be reused. A negative errno code
1594 *      is returned on a failure.
1595 *
1596 *      After unregistering, unregister and down device events are synthesized
1597 *      for all devices on the device list and delivered to the removed notifier,
1598 *      removing the need for special-case cleanup code.
1599 */
1600
1601int unregister_netdevice_notifier(struct notifier_block *nb)
1602{
1603        struct net_device *dev;
1604        struct net *net;
1605        int err;
1606
1607        rtnl_lock();
1608        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609        if (err)
1610                goto unlock;
1611
1612        for_each_net(net) {
1613                for_each_netdev(net, dev) {
1614                        if (dev->flags & IFF_UP) {
1615                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616                                                        dev);
1617                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618                        }
1619                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620                }
1621        }
1622unlock:
1623        rtnl_unlock();
1624        return err;
1625}
1626EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
1628/**
1629 *      call_netdevice_notifiers_info - call all network notifier blocks
1630 *      @val: value passed unmodified to notifier function
1631 *      @dev: net_device pointer passed unmodified to notifier function
1632 *      @info: notifier information data
1633 *
1634 *      Call all network notifier blocks.  Parameters and return value
1635 *      are as for raw_notifier_call_chain().
1636 */
1637
1638static int call_netdevice_notifiers_info(unsigned long val,
1639                                         struct net_device *dev,
1640                                         struct netdev_notifier_info *info)
1641{
1642        ASSERT_RTNL();
1643        netdev_notifier_info_init(info, dev);
1644        return raw_notifier_call_chain(&netdev_chain, val, info);
1645}
1646
1647/**
1648 *      call_netdevice_notifiers - call all network notifier blocks
1649 *      @val: value passed unmodified to notifier function
1650 *      @dev: net_device pointer passed unmodified to notifier function
1651 *
1652 *      Call all network notifier blocks.  Parameters and return value
1653 *      are as for raw_notifier_call_chain().
1654 */
1655
1656int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657{
1658        struct netdev_notifier_info info;
1659
1660        return call_netdevice_notifiers_info(val, dev, &info);
1661}
1662EXPORT_SYMBOL(call_netdevice_notifiers);
1663
1664#ifdef CONFIG_NET_INGRESS
1665static struct static_key ingress_needed __read_mostly;
1666
1667void net_inc_ingress_queue(void)
1668{
1669        static_key_slow_inc(&ingress_needed);
1670}
1671EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672
1673void net_dec_ingress_queue(void)
1674{
1675        static_key_slow_dec(&ingress_needed);
1676}
1677EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678#endif
1679
1680#ifdef CONFIG_NET_EGRESS
1681static struct static_key egress_needed __read_mostly;
1682
1683void net_inc_egress_queue(void)
1684{
1685        static_key_slow_inc(&egress_needed);
1686}
1687EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688
1689void net_dec_egress_queue(void)
1690{
1691        static_key_slow_dec(&egress_needed);
1692}
1693EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694#endif
1695
1696static struct static_key netstamp_needed __read_mostly;
1697#ifdef HAVE_JUMP_LABEL
1698static atomic_t netstamp_needed_deferred;
1699static void netstamp_clear(struct work_struct *work)
1700{
1701        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1702
1703        while (deferred--)
1704                static_key_slow_dec(&netstamp_needed);
1705}
1706static DECLARE_WORK(netstamp_work, netstamp_clear);
1707#endif
1708
1709void net_enable_timestamp(void)
1710{
1711        static_key_slow_inc(&netstamp_needed);
1712}
1713EXPORT_SYMBOL(net_enable_timestamp);
1714
1715void net_disable_timestamp(void)
1716{
1717#ifdef HAVE_JUMP_LABEL
1718        /* net_disable_timestamp() can be called from non-process context */
1719        atomic_inc(&netstamp_needed_deferred);
1720        schedule_work(&netstamp_work);
1721#else
1722        static_key_slow_dec(&netstamp_needed);
1723#endif
1724}
1725EXPORT_SYMBOL(net_disable_timestamp);
1726
1727static inline void net_timestamp_set(struct sk_buff *skb)
1728{
1729        skb->tstamp = 0;
1730        if (static_key_false(&netstamp_needed))
1731                __net_timestamp(skb);
1732}
1733
1734#define net_timestamp_check(COND, SKB)                  \
1735        if (static_key_false(&netstamp_needed)) {               \
1736                if ((COND) && !(SKB)->tstamp)   \
1737                        __net_timestamp(SKB);           \
1738        }                                               \
1739
1740bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1741{
1742        unsigned int len;
1743
1744        if (!(dev->flags & IFF_UP))
1745                return false;
1746
1747        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1748        if (skb->len <= len)
1749                return true;
1750
1751        /* if TSO is enabled, we don't care about the length as the packet
1752         * could be forwarded without being segmented first
1753         */
1754        if (skb_is_gso(skb))
1755                return true;
1756
1757        return false;
1758}
1759EXPORT_SYMBOL_GPL(is_skb_forwardable);
1760
1761int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1762{
1763        int ret = ____dev_forward_skb(dev, skb);
1764
1765        if (likely(!ret)) {
1766                skb->protocol = eth_type_trans(skb, dev);
1767                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1768        }
1769
1770        return ret;
1771}
1772EXPORT_SYMBOL_GPL(__dev_forward_skb);
1773
1774/**
1775 * dev_forward_skb - loopback an skb to another netif
1776 *
1777 * @dev: destination network device
1778 * @skb: buffer to forward
1779 *
1780 * return values:
1781 *      NET_RX_SUCCESS  (no congestion)
1782 *      NET_RX_DROP     (packet was dropped, but freed)
1783 *
1784 * dev_forward_skb can be used for injecting an skb from the
1785 * start_xmit function of one device into the receive queue
1786 * of another device.
1787 *
1788 * The receiving device may be in another namespace, so
1789 * we have to clear all information in the skb that could
1790 * impact namespace isolation.
1791 */
1792int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1793{
1794        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1795}
1796EXPORT_SYMBOL_GPL(dev_forward_skb);
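
/* Illustrative usage sketch, not part of dev.c: a hypothetical veth-style
 * pair device whose ndo_start_xmit hands every frame to its peer's receive
 * path.  The peer lookup via netdev_priv() is an assumption of this sketch;
 * dev_forward_skb() takes ownership of the skb in all cases.
 */
static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
{
        struct net_device *peer = *(struct net_device **)netdev_priv(dev);

        if (likely(peer))
                dev_forward_skb(peer, skb);
        else
                dev_kfree_skb_any(skb);
        return NETDEV_TX_OK;
}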
1797
1798static inline int deliver_skb(struct sk_buff *skb,
1799                              struct packet_type *pt_prev,
1800                              struct net_device *orig_dev)
1801{
1802        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1803                return -ENOMEM;
1804        atomic_inc(&skb->users);
1805        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1806}
1807
1808static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1809                                          struct packet_type **pt,
1810                                          struct net_device *orig_dev,
1811                                          __be16 type,
1812                                          struct list_head *ptype_list)
1813{
1814        struct packet_type *ptype, *pt_prev = *pt;
1815
1816        list_for_each_entry_rcu(ptype, ptype_list, list) {
1817                if (ptype->type != type)
1818                        continue;
1819                if (pt_prev)
1820                        deliver_skb(skb, pt_prev, orig_dev);
1821                pt_prev = ptype;
1822        }
1823        *pt = pt_prev;
1824}
1825
1826static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1827{
1828        if (!ptype->af_packet_priv || !skb->sk)
1829                return false;
1830
1831        if (ptype->id_match)
1832                return ptype->id_match(ptype, skb->sk);
1833        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1834                return true;
1835
1836        return false;
1837}
1838
1839/*
1840 *      Support routine. Sends outgoing frames to any network
1841 *      taps currently in use.
1842 */
1843
1844void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1845{
1846        struct packet_type *ptype;
1847        struct sk_buff *skb2 = NULL;
1848        struct packet_type *pt_prev = NULL;
1849        struct list_head *ptype_list = &ptype_all;
1850
1851        rcu_read_lock();
1852again:
1853        list_for_each_entry_rcu(ptype, ptype_list, list) {
1854                /* Never send packets back to the socket
1855                 * they originated from - MvS (miquels@drinkel.ow.org)
1856                 */
1857                if (skb_loop_sk(ptype, skb))
1858                        continue;
1859
1860                if (pt_prev) {
1861                        deliver_skb(skb2, pt_prev, skb->dev);
1862                        pt_prev = ptype;
1863                        continue;
1864                }
1865
1866                /* need to clone skb, done only once */
1867                skb2 = skb_clone(skb, GFP_ATOMIC);
1868                if (!skb2)
1869                        goto out_unlock;
1870
1871                net_timestamp_set(skb2);
1872
1873                /* skb->nh should be correctly
1874                 * set by the sender, so that the check below is
1875                 * just protection against buggy protocols.
1876                 */
1877                skb_reset_mac_header(skb2);
1878
1879                if (skb_network_header(skb2) < skb2->data ||
1880                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1881                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1882                                             ntohs(skb2->protocol),
1883                                             dev->name);
1884                        skb_reset_network_header(skb2);
1885                }
1886
1887                skb2->transport_header = skb2->network_header;
1888                skb2->pkt_type = PACKET_OUTGOING;
1889                pt_prev = ptype;
1890        }
1891
1892        if (ptype_list == &ptype_all) {
1893                ptype_list = &dev->ptype_all;
1894                goto again;
1895        }
1896out_unlock:
1897        if (pt_prev)
1898                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1899        rcu_read_unlock();
1900}
1901EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1902
1903/**
1904 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1905 * @dev: Network device
1906 * @txq: number of queues available
1907 *
1908 * If real_num_tx_queues is changed the tc mappings may no longer be
1909 * valid. To resolve this, verify that each tc mapping remains valid
1910 * and, if not, reset the mapping to TC0. With no priorities mapping
1911 * to an offset/count pair, it will no longer be used. In the worst
1912 * case, if TC0 itself is invalid nothing can be done, so priority
1913 * mappings are disabled. It is expected that drivers will fix this
1914 * mapping if they can before calling netif_set_real_num_tx_queues.
1915 */
1916static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1917{
1918        int i;
1919        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1920
1921        /* If TC0 is invalidated disable TC mapping */
1922        if (tc->offset + tc->count > txq) {
1923                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1924                dev->num_tc = 0;
1925                return;
1926        }
1927
1928        /* Invalidated prio-to-tc mappings are set to TC0 */
1929        for (i = 1; i < TC_BITMASK + 1; i++) {
1930                int q = netdev_get_prio_tc_map(dev, i);
1931
1932                tc = &dev->tc_to_txq[q];
1933                if (tc->offset + tc->count > txq) {
1934                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1935                                i, q);
1936                        netdev_set_prio_tc_map(dev, i, 0);
1937                }
1938        }
1939}
1940
1941int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1942{
1943        if (dev->num_tc) {
1944                struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1945                int i;
1946
1947                for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1948                        if ((txq - tc->offset) < tc->count)
1949                                return i;
1950                }
1951
1952                return -1;
1953        }
1954
1955        return 0;
1956}
1957
1958#ifdef CONFIG_XPS
1959static DEFINE_MUTEX(xps_map_mutex);
1960#define xmap_dereference(P)             \
1961        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1962
1963static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1964                             int tci, u16 index)
1965{
1966        struct xps_map *map = NULL;
1967        int pos;
1968
1969        if (dev_maps)
1970                map = xmap_dereference(dev_maps->cpu_map[tci]);
1971        if (!map)
1972                return false;
1973
1974        for (pos = map->len; pos--;) {
1975                if (map->queues[pos] != index)
1976                        continue;
1977
1978                if (map->len > 1) {
1979                        map->queues[pos] = map->queues[--map->len];
1980                        break;
1981                }
1982
1983                RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
1984                kfree_rcu(map, rcu);
1985                return false;
1986        }
1987
1988        return true;
1989}
1990
1991static bool remove_xps_queue_cpu(struct net_device *dev,
1992                                 struct xps_dev_maps *dev_maps,
1993                                 int cpu, u16 offset, u16 count)
1994{
1995        int num_tc = dev->num_tc ? : 1;
1996        bool active = false;
1997        int tci;
1998
1999        for (tci = cpu * num_tc; num_tc--; tci++) {
2000                int i, j;
2001
2002                for (i = count, j = offset; i--; j++) {
2003                        if (!remove_xps_queue(dev_maps, tci, j))
2004                                break;
2005                }
2006
2007                active |= i < 0;
2008        }
2009
2010        return active;
2011}
2012
2013static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2014                                   u16 count)
2015{
2016        struct xps_dev_maps *dev_maps;
2017        int cpu, i;
2018        bool active = false;
2019
2020        mutex_lock(&xps_map_mutex);
2021        dev_maps = xmap_dereference(dev->xps_maps);
2022
2023        if (!dev_maps)
2024                goto out_no_maps;
2025
2026        for_each_possible_cpu(cpu)
2027                active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2028                                               offset, count);
2029
2030        if (!active) {
2031                RCU_INIT_POINTER(dev->xps_maps, NULL);
2032                kfree_rcu(dev_maps, rcu);
2033        }
2034
2035        for (i = offset + (count - 1); count--; i--)
2036                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2037                                             NUMA_NO_NODE);
2038
2039out_no_maps:
2040        mutex_unlock(&xps_map_mutex);
2041}
2042
2043static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2044{
2045        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2046}
2047
2048static struct xps_map *expand_xps_map(struct xps_map *map,
2049                                      int cpu, u16 index)
2050{
2051        struct xps_map *new_map;
2052        int alloc_len = XPS_MIN_MAP_ALLOC;
2053        int i, pos;
2054
2055        for (pos = 0; map && pos < map->len; pos++) {
2056                if (map->queues[pos] != index)
2057                        continue;
2058                return map;
2059        }
2060
2061        /* Need to add queue to this CPU's existing map */
2062        if (map) {
2063                if (pos < map->alloc_len)
2064                        return map;
2065
2066                alloc_len = map->alloc_len * 2;
2067        }
2068
2069        /* Need to allocate new map to store queue on this CPU's map */
2070        new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2071                               cpu_to_node(cpu));
2072        if (!new_map)
2073                return NULL;
2074
2075        for (i = 0; i < pos; i++)
2076                new_map->queues[i] = map->queues[i];
2077        new_map->alloc_len = alloc_len;
2078        new_map->len = pos;
2079
2080        return new_map;
2081}
2082
2083int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2084                        u16 index)
2085{
2086        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2087        int i, cpu, tci, numa_node_id = -2;
2088        int maps_sz, num_tc = 1, tc = 0;
2089        struct xps_map *map, *new_map;
2090        bool active = false;
2091
2092        if (dev->num_tc) {
2093                num_tc = dev->num_tc;
2094                tc = netdev_txq_to_tc(dev, index);
2095                if (tc < 0)
2096                        return -EINVAL;
2097        }
2098
2099        maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2100        if (maps_sz < L1_CACHE_BYTES)
2101                maps_sz = L1_CACHE_BYTES;
2102
2103        mutex_lock(&xps_map_mutex);
2104
2105        dev_maps = xmap_dereference(dev->xps_maps);
2106
2107        /* allocate memory for queue storage */
2108        for_each_cpu_and(cpu, cpu_online_mask, mask) {
2109                if (!new_dev_maps)
2110                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2111                if (!new_dev_maps) {
2112                        mutex_unlock(&xps_map_mutex);
2113                        return -ENOMEM;
2114                }
2115
2116                tci = cpu * num_tc + tc;
2117                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2118                                 NULL;
2119
2120                map = expand_xps_map(map, cpu, index);
2121                if (!map)
2122                        goto error;
2123
2124                RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2125        }
2126
2127        if (!new_dev_maps)
2128                goto out_no_new_maps;
2129
2130        for_each_possible_cpu(cpu) {
2131                /* copy maps belonging to foreign traffic classes */
2132                for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2133                        /* fill in the new device map from the old device map */
2134                        map = xmap_dereference(dev_maps->cpu_map[tci]);
2135                        RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2136                }
2137
2138                /* We need to explicitly update tci as the previous loop
2139                 * could break out early if dev_maps is NULL.
2140                 */
2141                tci = cpu * num_tc + tc;
2142
2143                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2144                        /* add queue to CPU maps */
2145                        int pos = 0;
2146
2147                        map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2148                        while ((pos < map->len) && (map->queues[pos] != index))
2149                                pos++;
2150
2151                        if (pos == map->len)
2152                                map->queues[map->len++] = index;
2153#ifdef CONFIG_NUMA
2154                        if (numa_node_id == -2)
2155                                numa_node_id = cpu_to_node(cpu);
2156                        else if (numa_node_id != cpu_to_node(cpu))
2157                                numa_node_id = -1;
2158#endif
2159                } else if (dev_maps) {
2160                        /* fill in the new device map from the old device map */
2161                        map = xmap_dereference(dev_maps->cpu_map[tci]);
2162                        RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2163                }
2164
2165                /* copy maps belonging to foreign traffic classes */
2166                for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2167                        /* fill in the new device map from the old device map */
2168                        map = xmap_dereference(dev_maps->cpu_map[tci]);
2169                        RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2170                }
2171        }
2172
2173        rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2174
2175        /* Cleanup old maps */
2176        if (!dev_maps)
2177                goto out_no_old_maps;
2178
2179        for_each_possible_cpu(cpu) {
2180                for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2181                        new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2182                        map = xmap_dereference(dev_maps->cpu_map[tci]);
2183                        if (map && map != new_map)
2184                                kfree_rcu(map, rcu);
2185                }
2186        }
2187
2188        kfree_rcu(dev_maps, rcu);
2189
2190out_no_old_maps:
2191        dev_maps = new_dev_maps;
2192        active = true;
2193
2194out_no_new_maps:
2195        /* update Tx queue numa node */
2196        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2197                                     (numa_node_id >= 0) ? numa_node_id :
2198                                     NUMA_NO_NODE);
2199
2200        if (!dev_maps)
2201                goto out_no_maps;
2202
2203        /* removes queue from unused CPUs */
2204        for_each_possible_cpu(cpu) {
2205                for (i = tc, tci = cpu * num_tc; i--; tci++)
2206                        active |= remove_xps_queue(dev_maps, tci, index);
2207                if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2208                        active |= remove_xps_queue(dev_maps, tci, index);
2209                for (i = num_tc - tc, tci++; --i; tci++)
2210                        active |= remove_xps_queue(dev_maps, tci, index);
2211        }
2212
2213        /* free map if not active */
2214        if (!active) {
2215                RCU_INIT_POINTER(dev->xps_maps, NULL);
2216                kfree_rcu(dev_maps, rcu);
2217        }
2218
2219out_no_maps:
2220        mutex_unlock(&xps_map_mutex);
2221
2222        return 0;
2223error:
2224        /* remove any maps that we added */
2225        for_each_possible_cpu(cpu) {
2226                for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2227                        new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2228                        map = dev_maps ?
2229                              xmap_dereference(dev_maps->cpu_map[tci]) :
2230                              NULL;
2231                        if (new_map && new_map != map)
2232                                kfree(new_map);
2233                }
2234        }
2235
2236        mutex_unlock(&xps_map_mutex);
2237
2238        kfree(new_dev_maps);
2239        return -ENOMEM;
2240}
2241EXPORT_SYMBOL(netif_set_xps_queue);
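
/* Illustrative usage sketch, not part of dev.c: a hypothetical multiqueue
 * driver pinning transmit queue 0 to the first online CPU, queue 1 to the
 * second, and so on, so that packets generated on a CPU prefer "its" queue.
 * Assumes real_num_tx_queues has already been set by the driver.
 */
static void example_setup_xps(struct net_device *dev)
{
        u16 qi = 0;
        int cpu;

        for_each_online_cpu(cpu) {
                if (qi >= dev->real_num_tx_queues)
                        break;
                netif_set_xps_queue(dev, cpumask_of(cpu), qi++);
        }
}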
2242
2243#endif
2244void netdev_reset_tc(struct net_device *dev)
2245{
2246#ifdef CONFIG_XPS
2247        netif_reset_xps_queues_gt(dev, 0);
2248#endif
2249        dev->num_tc = 0;
2250        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2251        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2252}
2253EXPORT_SYMBOL(netdev_reset_tc);
2254
2255int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2256{
2257        if (tc >= dev->num_tc)
2258                return -EINVAL;
2259
2260#ifdef CONFIG_XPS
2261        netif_reset_xps_queues(dev, offset, count);
2262#endif
2263        dev->tc_to_txq[tc].count = count;
2264        dev->tc_to_txq[tc].offset = offset;
2265        return 0;
2266}
2267EXPORT_SYMBOL(netdev_set_tc_queue);
2268
2269int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2270{
2271        if (num_tc > TC_MAX_QUEUE)
2272                return -EINVAL;
2273
2274#ifdef CONFIG_XPS
2275        netif_reset_xps_queues_gt(dev, 0);
2276#endif
2277        dev->num_tc = num_tc;
2278        return 0;
2279}
2280EXPORT_SYMBOL(netdev_set_num_tc);
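
/* Illustrative usage sketch, not part of dev.c: a hypothetical driver
 * carving its transmit queues into two traffic classes of four queues each,
 * e.g. in response to an mqprio-style request.  The counts and offsets are
 * example values only; a real driver derives them from its hardware layout.
 */
static int example_setup_two_tcs(struct net_device *dev)
{
        int err;

        err = netdev_set_num_tc(dev, 2);
        if (err)
                return err;

        netdev_set_tc_queue(dev, 0, 4, 0);      /* tc 0 -> queues 0-3 */
        netdev_set_tc_queue(dev, 1, 4, 4);      /* tc 1 -> queues 4-7 */
        return 0;
}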
2281
2282/*
2283 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2284 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2285 */
2286int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2287{
2288        int rc;
2289
2290        if (txq < 1 || txq > dev->num_tx_queues)
2291                return -EINVAL;
2292
2293        if (dev->reg_state == NETREG_REGISTERED ||
2294            dev->reg_state == NETREG_UNREGISTERING) {
2295                ASSERT_RTNL();
2296
2297                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2298                                                  txq);
2299                if (rc)
2300                        return rc;
2301
2302                if (dev->num_tc)
2303                        netif_setup_tc(dev, txq);
2304
2305                if (txq < dev->real_num_tx_queues) {
2306                        qdisc_reset_all_tx_gt(dev, txq);
2307#ifdef CONFIG_XPS
2308                        netif_reset_xps_queues_gt(dev, txq);
2309#endif
2310                }
2311        }
2312
2313        dev->real_num_tx_queues = txq;
2314        return 0;
2315}
2316EXPORT_SYMBOL(netif_set_real_num_tx_queues);
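
/* Illustrative usage sketch, not part of dev.c: a hypothetical "set channels"
 * handler shrinking or growing the number of in-use transmit queues.  Once
 * the device is registered this must run under the RTNL; the core then
 * flushes qdiscs and XPS maps for any queues that disappear.
 */
static int example_set_tx_channels(struct net_device *dev, unsigned int txq)
{
        ASSERT_RTNL();
        return netif_set_real_num_tx_queues(dev, txq);
}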
2317
2318#ifdef CONFIG_SYSFS
2319/**
2320 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2321 *      @dev: Network device
2322 *      @rxq: Actual number of RX queues
2323 *
2324 *      This must be called either with the rtnl_lock held or before
2325 *      registration of the net device.  Returns 0 on success, or a
2326 *      negative error code.  If called before registration, it always
2327 *      succeeds.
2328 */
2329int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2330{
2331        int rc;
2332
2333        if (rxq < 1 || rxq > dev->num_rx_queues)
2334                return -EINVAL;
2335
2336        if (dev->reg_state == NETREG_REGISTERED) {
2337                ASSERT_RTNL();
2338
2339                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2340                                                  rxq);
2341                if (rc)
2342                        return rc;
2343        }
2344
2345        dev->real_num_rx_queues = rxq;
2346        return 0;
2347}
2348EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2349#endif
2350
2351/**
2352 * netif_get_num_default_rss_queues - default number of RSS queues
2353 *
2354 * This routine returns the default upper limit on the number of RSS
2355 * queues that multiqueue devices should use.
2356 */
2357int netif_get_num_default_rss_queues(void)
2358{
2359        return is_kdump_kernel() ?
2360                1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2361}
2362EXPORT_SYMBOL(netif_get_num_default_rss_queues);
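
/* Illustrative usage sketch, not part of dev.c: a hypothetical driver sizing
 * its RSS queue count from the default upper bound, further capped by a
 * made-up hardware limit (EXAMPLE_HW_MAX_QUEUES is not a real constant).
 */
#define EXAMPLE_HW_MAX_QUEUES   16

static unsigned int example_pick_num_rss_queues(void)
{
        return min_t(unsigned int, EXAMPLE_HW_MAX_QUEUES,
                     netif_get_num_default_rss_queues());
}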
2363
2364static void __netif_reschedule(struct Qdisc *q)
2365{
2366        struct softnet_data *sd;
2367        unsigned long flags;
2368
2369        local_irq_save(flags);
2370        sd = this_cpu_ptr(&softnet_data);
2371        q->next_sched = NULL;
2372        *sd->output_queue_tailp = q;
2373        sd->output_queue_tailp = &q->next_sched;
2374        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2375        local_irq_restore(flags);
2376}
2377
2378void __netif_schedule(struct Qdisc *q)
2379{
2380        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2381                __netif_reschedule(q);
2382}
2383EXPORT_SYMBOL(__netif_schedule);
2384
2385struct dev_kfree_skb_cb {
2386        enum skb_free_reason reason;
2387};
2388
2389static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2390{
2391        return (struct dev_kfree_skb_cb *)skb->cb;
2392}
2393
2394void netif_schedule_queue(struct netdev_queue *txq)
2395{
2396        rcu_read_lock();
2397        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2398                struct Qdisc *q = rcu_dereference(txq->qdisc);
2399
2400                __netif_schedule(q);
2401        }
2402        rcu_read_unlock();
2403}
2404EXPORT_SYMBOL(netif_schedule_queue);
2405
2406/**
2407 *      netif_wake_subqueue - allow sending packets on subqueue
2408 *      @dev: network device
2409 *      @queue_index: sub queue index
2410 *
2411 * Resume individual transmit queue of a device with multiple transmit queues.
2412 */
2413void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2414{
2415        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2416
2417        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2418                struct Qdisc *q;
2419
2420                rcu_read_lock();
2421                q = rcu_dereference(txq->qdisc);
2422                __netif_schedule(q);
2423                rcu_read_unlock();
2424        }
2425}
2426EXPORT_SYMBOL(netif_wake_subqueue);
2427
2428void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2429{
2430        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2431                struct Qdisc *q;
2432
2433                rcu_read_lock();
2434                q = rcu_dereference(dev_queue->qdisc);
2435                __netif_schedule(q);
2436                rcu_read_unlock();
2437        }
2438}
2439EXPORT_SYMBOL(netif_tx_wake_queue);
2440
2441void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2442{
2443        unsigned long flags;
2444
2445        if (likely(atomic_read(&skb->users) == 1)) {
2446                smp_rmb();
2447                atomic_set(&skb->users, 0);
2448        } else if (likely(!atomic_dec_and_test(&skb->users))) {
2449                return;
2450        }
2451        get_kfree_skb_cb(skb)->reason = reason;
2452        local_irq_save(flags);
2453        skb->next = __this_cpu_read(softnet_data.completion_queue);
2454        __this_cpu_write(softnet_data.completion_queue, skb);
2455        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2456        local_irq_restore(flags);
2457}
2458EXPORT_SYMBOL(__dev_kfree_skb_irq);
2459
2460void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2461{
2462        if (in_irq() || irqs_disabled())
2463                __dev_kfree_skb_irq(skb, reason);
2464        else
2465                dev_kfree_skb(skb);
2466}
2467EXPORT_SYMBOL(__dev_kfree_skb_any);
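
/* Illustrative usage sketch, not part of dev.c: a hypothetical TX completion
 * handler that may run in hard-IRQ context and therefore must use the _any
 * variants rather than plain consume_skb()/kfree_skb().
 */
static void example_tx_complete(struct sk_buff *skb, bool sent_ok)
{
        if (sent_ok)
                dev_consume_skb_any(skb);       /* transmitted, not a drop */
        else
                dev_kfree_skb_any(skb);         /* accounted as a drop */
}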
2468
2469
2470/**
2471 * netif_device_detach - mark device as removed
2472 * @dev: network device
2473 *
2474 * Mark device as removed from the system and therefore no longer available.
2475 */
2476void netif_device_detach(struct net_device *dev)
2477{
2478        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2479            netif_running(dev)) {
2480                netif_tx_stop_all_queues(dev);
2481        }
2482}
2483EXPORT_SYMBOL(netif_device_detach);
2484
2485/**
2486 * netif_device_attach - mark device as attached
2487 * @dev: network device
2488 *
2489 * Mark device as attached to the system and restart it if needed.
2490 */
2491void netif_device_attach(struct net_device *dev)
2492{
2493        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2494            netif_running(dev)) {
2495                netif_tx_wake_all_queues(dev);
2496                __netdev_watchdog_up(dev);
2497        }
2498}
2499EXPORT_SYMBOL(netif_device_attach);
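
/* Illustrative usage sketch, not part of dev.c: hypothetical suspend/resume
 * hooks of a driver.  Detaching stops all TX queues so the stack stops
 * handing the driver packets while the hardware is powered down; attaching
 * wakes the queues and re-arms the watchdog.
 */
static int example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);
        /* ... quiesce and power down the hardware ... */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        /* ... power up and reprogram the hardware ... */
        netif_device_attach(dev);
        return 0;
}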
2500
2501/*
2502 * Returns a Tx hash based on the given packet descriptor and the number
2503 * of Tx queues to be used as a distribution range.
2504 */
2505u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2506                  unsigned int num_tx_queues)
2507{
2508        u32 hash;
2509        u16 qoffset = 0;
2510        u16 qcount = num_tx_queues;
2511
2512        if (skb_rx_queue_recorded(skb)) {
2513                hash = skb_get_rx_queue(skb);
2514                while (unlikely(hash >= num_tx_queues))
2515                        hash -= num_tx_queues;
2516                return hash;
2517        }
2518
2519        if (dev->num_tc) {
2520                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2521                qoffset = dev->tc_to_txq[tc].offset;
2522                qcount = dev->tc_to_txq[tc].count;
2523        }
2524
2525        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2526}
2527EXPORT_SYMBOL(__skb_tx_hash);
2528
2529static void skb_warn_bad_offload(const struct sk_buff *skb)
2530{
2531        static const netdev_features_t null_features;
2532        struct net_device *dev = skb->dev;
2533        const char *name = "";
2534
2535        if (!net_ratelimit())
2536                return;
2537
2538        if (dev) {
2539                if (dev->dev.parent)
2540                        name = dev_driver_string(dev->dev.parent);
2541                else
2542                        name = netdev_name(dev);
2543        }
2544        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2545             "gso_type=%d ip_summed=%d\n",
2546             name, dev ? &dev->features : &null_features,
2547             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2548             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2549             skb_shinfo(skb)->gso_type, skb->ip_summed);
2550}
2551
2552/*
2553 * Invalidate hardware checksum when packet is to be mangled, and
2554 * complete checksum manually on outgoing path.
2555 */
2556int skb_checksum_help(struct sk_buff *skb)
2557{
2558        __wsum csum;
2559        int ret = 0, offset;
2560
2561        if (skb->ip_summed == CHECKSUM_COMPLETE)
2562                goto out_set_summed;
2563
2564        if (unlikely(skb_shinfo(skb)->gso_size)) {
2565                skb_warn_bad_offload(skb);
2566                return -EINVAL;
2567        }
2568
2569        /* Before computing a checksum, we should make sure no frag could
2570         * be modified by an external entity: the checksum could otherwise be wrong.
2571         */
2572        if (skb_has_shared_frag(skb)) {
2573                ret = __skb_linearize(skb);
2574                if (ret)
2575                        goto out;
2576        }
2577
2578        offset = skb_checksum_start_offset(skb);
2579        BUG_ON(offset >= skb_headlen(skb));
2580        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2581
2582        offset += skb->csum_offset;
2583        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2584
2585        if (skb_cloned(skb) &&
2586            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2587                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2588                if (ret)
2589                        goto out;
2590        }
2591
2592        *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2593out_set_summed:
2594        skb->ip_summed = CHECKSUM_NONE;
2595out:
2596        return ret;
2597}
2598EXPORT_SYMBOL(skb_checksum_help);
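
/* Illustrative usage sketch, not part of dev.c: a hypothetical transmit path
 * falling back to a software checksum when the hardware cannot offload this
 * particular packet, mirroring what validate_xmit_skb() does further below.
 */
static int example_tx_checksum(struct sk_buff *skb, bool hw_can_csum)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
                return skb_checksum_help(skb);
        return 0;
}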
2599
2600__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2601{
2602        __be16 type = skb->protocol;
2603
2604        /* Tunnel gso handlers can set protocol to ethernet. */
2605        if (type == htons(ETH_P_TEB)) {
2606                struct ethhdr *eth;
2607
2608                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2609                        return 0;
2610
2611                eth = (struct ethhdr *)skb_mac_header(skb);
2612                type = eth->h_proto;
2613        }
2614
2615        return __vlan_get_protocol(skb, type, depth);
2616}
2617
2618/**
2619 *      skb_mac_gso_segment - mac layer segmentation handler.
2620 *      @skb: buffer to segment
2621 *      @features: features for the output path (see dev->features)
2622 */
2623struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2624                                    netdev_features_t features)
2625{
2626        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2627        struct packet_offload *ptype;
2628        int vlan_depth = skb->mac_len;
2629        __be16 type = skb_network_protocol(skb, &vlan_depth);
2630
2631        if (unlikely(!type))
2632                return ERR_PTR(-EINVAL);
2633
2634        __skb_pull(skb, vlan_depth);
2635
2636        rcu_read_lock();
2637        list_for_each_entry_rcu(ptype, &offload_base, list) {
2638                if (ptype->type == type && ptype->callbacks.gso_segment) {
2639                        segs = ptype->callbacks.gso_segment(skb, features);
2640                        break;
2641                }
2642        }
2643        rcu_read_unlock();
2644
2645        __skb_push(skb, skb->data - skb_mac_header(skb));
2646
2647        return segs;
2648}
2649EXPORT_SYMBOL(skb_mac_gso_segment);
2650
2651
2652/* openvswitch calls this on the rx path, so we need a different check.
2653 */
2654static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2655{
2656        if (tx_path)
2657                return skb->ip_summed != CHECKSUM_PARTIAL;
2658        else
2659                return skb->ip_summed == CHECKSUM_NONE;
2660}
2661
2662/**
2663 *      __skb_gso_segment - Perform segmentation on skb.
2664 *      @skb: buffer to segment
2665 *      @features: features for the output path (see dev->features)
2666 *      @tx_path: whether it is called in TX path
2667 *
2668 *      This function segments the given skb and returns a list of segments.
2669 *
2670 *      It may return NULL if the skb requires no segmentation.  This is
2671 *      only possible when GSO is used for verifying header integrity.
2672 *
2673 *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2674 */
2675struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2676                                  netdev_features_t features, bool tx_path)
2677{
2678        if (unlikely(skb_needs_check(skb, tx_path))) {
2679                int err;
2680
2681                skb_warn_bad_offload(skb);
2682
2683                err = skb_cow_head(skb, 0);
2684                if (err < 0)
2685                        return ERR_PTR(err);
2686        }
2687
2688        /* Only report GSO partial support if it will enable us to
2689         * support segmentation on this frame without needing additional
2690         * work.
2691         */
2692        if (features & NETIF_F_GSO_PARTIAL) {
2693                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2694                struct net_device *dev = skb->dev;
2695
2696                partial_features |= dev->features & dev->gso_partial_features;
2697                if (!skb_gso_ok(skb, features | partial_features))
2698                        features &= ~NETIF_F_GSO_PARTIAL;
2699        }
2700
2701        BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2702                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2703
2704        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2705        SKB_GSO_CB(skb)->encap_level = 0;
2706
2707        skb_reset_mac_header(skb);
2708        skb_reset_mac_len(skb);
2709
2710        return skb_mac_gso_segment(skb, features);
2711}
2712EXPORT_SYMBOL(__skb_gso_segment);
2713
2714/* Take action when hardware reception checksum errors are detected. */
2715#ifdef CONFIG_BUG
2716void netdev_rx_csum_fault(struct net_device *dev)
2717{
2718        if (net_ratelimit()) {
2719                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2720                dump_stack();
2721        }
2722}
2723EXPORT_SYMBOL(netdev_rx_csum_fault);
2724#endif
2725
2726/* Actually, we should eliminate this check as soon as we know that:
2727 * 1. IOMMU is present and can map all the memory.
2728 * 2. No high memory really exists on this machine.
2729 */
2730
2731static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2732{
2733#ifdef CONFIG_HIGHMEM
2734        int i;
2735        if (!(dev->features & NETIF_F_HIGHDMA)) {
2736                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2737                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2738                        if (PageHighMem(skb_frag_page(frag)))
2739                                return 1;
2740                }
2741        }
2742
2743        if (PCI_DMA_BUS_IS_PHYS) {
2744                struct device *pdev = dev->dev.parent;
2745
2746                if (!pdev)
2747                        return 0;
2748                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2749                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2750                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2751                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2752                                return 1;
2753                }
2754        }
2755#endif
2756        return 0;
2757}
2758
2759/* For an MPLS offload request, verify we are testing hardware MPLS features
2760 * instead of standard features for the netdev.
2761 */
2762#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2763static netdev_features_t net_mpls_features(struct sk_buff *skb,
2764                                           netdev_features_t features,
2765                                           __be16 type)
2766{
2767        if (eth_p_mpls(type))
2768                features &= skb->dev->mpls_features;
2769
2770        return features;
2771}
2772#else
2773static netdev_features_t net_mpls_features(struct sk_buff *skb,
2774                                           netdev_features_t features,
2775                                           __be16 type)
2776{
2777        return features;
2778}
2779#endif
2780
2781static netdev_features_t harmonize_features(struct sk_buff *skb,
2782        netdev_features_t features)
2783{
2784        int tmp;
2785        __be16 type;
2786
2787        type = skb_network_protocol(skb, &tmp);
2788        features = net_mpls_features(skb, features, type);
2789
2790        if (skb->ip_summed != CHECKSUM_NONE &&
2791            !can_checksum_protocol(features, type)) {
2792                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2793        }
2794        if (illegal_highdma(skb->dev, skb))
2795                features &= ~NETIF_F_SG;
2796
2797        return features;
2798}
2799
2800netdev_features_t passthru_features_check(struct sk_buff *skb,
2801                                          struct net_device *dev,
2802                                          netdev_features_t features)
2803{
2804        return features;
2805}
2806EXPORT_SYMBOL(passthru_features_check);
2807
2808static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2809                                             struct net_device *dev,
2810                                             netdev_features_t features)
2811{
2812        return vlan_features_check(skb, features);
2813}
2814
2815static netdev_features_t gso_features_check(const struct sk_buff *skb,
2816                                            struct net_device *dev,
2817                                            netdev_features_t features)
2818{
2819        u16 gso_segs = skb_shinfo(skb)->gso_segs;
2820
2821        if (gso_segs > dev->gso_max_segs)
2822                return features & ~NETIF_F_GSO_MASK;
2823
2824        /* Support for GSO partial features requires software
2825         * intervention before we can actually process the packets,
2826         * so strip support for any partial features now; they can be
2827         * pulled back in after the frame has been partially
2828         * segmented.
2829         */
2830        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2831                features &= ~dev->gso_partial_features;
2832
2833        /* Make sure to clear the IPv4 ID mangling feature if the
2834         * IPv4 header has the potential to be fragmented.
2835         */
2836        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2837                struct iphdr *iph = skb->encapsulation ?
2838                                    inner_ip_hdr(skb) : ip_hdr(skb);
2839
2840                if (!(iph->frag_off & htons(IP_DF)))
2841                        features &= ~NETIF_F_TSO_MANGLEID;
2842        }
2843
2844        return features;
2845}
2846
2847netdev_features_t netif_skb_features(struct sk_buff *skb)
2848{
2849        struct net_device *dev = skb->dev;
2850        netdev_features_t features = dev->features;
2851
2852        if (skb_is_gso(skb))
2853                features = gso_features_check(skb, dev, features);
2854
2855        /* For an encapsulation offload request, verify we are testing
2856         * hardware encapsulation features instead of standard
2857         * features for the netdev.
2858         */
2859        if (skb->encapsulation)
2860                features &= dev->hw_enc_features;
2861
2862        if (skb_vlan_tagged(skb))
2863                features = netdev_intersect_features(features,
2864                                                     dev->vlan_features |
2865                                                     NETIF_F_HW_VLAN_CTAG_TX |
2866                                                     NETIF_F_HW_VLAN_STAG_TX);
2867
2868        if (dev->netdev_ops->ndo_features_check)
2869                features &= dev->netdev_ops->ndo_features_check(skb, dev,
2870                                                                features);
2871        else
2872                features &= dflt_features_check(skb, dev, features);
2873
2874        return harmonize_features(skb, features);
2875}
2876EXPORT_SYMBOL(netif_skb_features);
2877
2878static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2879                    struct netdev_queue *txq, bool more)
2880{
2881        unsigned int len;
2882        int rc;
2883
2884        if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2885                dev_queue_xmit_nit(skb, dev);
2886
2887        len = skb->len;
2888        trace_net_dev_start_xmit(skb, dev);
2889        rc = netdev_start_xmit(skb, dev, txq, more);
2890        trace_net_dev_xmit(skb, rc, dev, len);
2891
2892        return rc;
2893}
2894
2895struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2896                                    struct netdev_queue *txq, int *ret)
2897{
2898        struct sk_buff *skb = first;
2899        int rc = NETDEV_TX_OK;
2900
2901        while (skb) {
2902                struct sk_buff *next = skb->next;
2903
2904                skb->next = NULL;
2905                rc = xmit_one(skb, dev, txq, next != NULL);
2906                if (unlikely(!dev_xmit_complete(rc))) {
2907                        skb->next = next;
2908                        goto out;
2909                }
2910
2911                skb = next;
2912                if (netif_xmit_stopped(txq) && skb) {
2913                        rc = NETDEV_TX_BUSY;
2914                        break;
2915                }
2916        }
2917
2918out:
2919        *ret = rc;
2920        return skb;
2921}
2922
2923static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2924                                          netdev_features_t features)
2925{
2926        if (skb_vlan_tag_present(skb) &&
2927            !vlan_hw_offload_capable(features, skb->vlan_proto))
2928                skb = __vlan_hwaccel_push_inside(skb);
2929        return skb;
2930}
2931
2932static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2933{
2934        netdev_features_t features;
2935
2936        features = netif_skb_features(skb);
2937        skb = validate_xmit_vlan(skb, features);
2938        if (unlikely(!skb))
2939                goto out_null;
2940
2941        if (netif_needs_gso(skb, features)) {
2942                struct sk_buff *segs;
2943
2944                segs = skb_gso_segment(skb, features);
2945                if (IS_ERR(segs)) {
2946                        goto out_kfree_skb;
2947                } else if (segs) {
2948                        consume_skb(skb);
2949                        skb = segs;
2950                }
2951        } else {
2952                if (skb_needs_linearize(skb, features) &&
2953                    __skb_linearize(skb))
2954                        goto out_kfree_skb;
2955
2956                /* If packet is not checksummed and device does not
2957                 * support checksumming for this protocol, complete
2958                 * checksumming here.
2959                 */
2960                if (skb->ip_summed == CHECKSUM_PARTIAL) {
2961                        if (skb->encapsulation)
2962                                skb_set_inner_transport_header(skb,
2963                                                               skb_checksum_start_offset(skb));
2964                        else
2965                                skb_set_transport_header(skb,
2966                                                         skb_checksum_start_offset(skb));
2967                        if (!(features & NETIF_F_CSUM_MASK) &&
2968                            skb_checksum_help(skb))
2969                                goto out_kfree_skb;
2970                }
2971        }
2972
2973        return skb;
2974
2975out_kfree_skb:
2976        kfree_skb(skb);
2977out_null:
2978        atomic_long_inc(&dev->tx_dropped);
2979        return NULL;
2980}
2981
2982struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2983{
2984        struct sk_buff *next, *head = NULL, *tail;
2985
2986        for (; skb != NULL; skb = next) {
2987                next = skb->next;
2988                skb->next = NULL;
2989
2990                /* in case the skb won't be segmented, point it to itself */
2991                skb->prev = skb;
2992
2993                skb = validate_xmit_skb(skb, dev);
2994                if (!skb)
2995                        continue;
2996
2997                if (!head)
2998                        head = skb;
2999                else
3000                        tail->next = skb;
3001                /* If skb was segmented, skb->prev points to
3002                 * the last segment. If not, it still contains skb.
3003                 */
3004                tail = skb->prev;
3005        }
3006        return head;
3007}
3008EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3009
3010static void qdisc_pkt_len_init(struct sk_buff *skb)
3011{
3012        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3013
3014        qdisc_skb_cb(skb)->pkt_len = skb->len;
3015
3016        /* To get a more precise estimate of bytes sent on the wire,
3017         * we add the header size of all segments to pkt_len
3018         */
3019        if (shinfo->gso_size)  {
3020                unsigned int hdr_len;
3021                u16 gso_segs = shinfo->gso_segs;
3022
3023                /* mac layer + network layer */
3024                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3025
3026                /* + transport layer */
3027                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3028                        hdr_len += tcp_hdrlen(skb);
3029                else
3030                        hdr_len += sizeof(struct udphdr);
3031
3032                if (shinfo->gso_type & SKB_GSO_DODGY)
3033                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3034                                                shinfo->gso_size);
3035
3036                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3037        }
3038}
3039
3040static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3041                                 struct net_device *dev,
3042                                 struct netdev_queue *txq)
3043{
3044        spinlock_t *root_lock = qdisc_lock(q);
3045        struct sk_buff *to_free = NULL;
3046        bool contended;
3047        int rc;
3048
3049        qdisc_calculate_pkt_len(skb, q);
3050        /*
3051         * Heuristic to force contended enqueues to serialize on a
3052         * separate lock before trying to get the qdisc main lock.
3053         * This permits the qdisc->running owner to get the lock more
3054         * often and dequeue packets faster.
3055         */
3056        contended = qdisc_is_running(q);
3057        if (unlikely(contended))
3058                spin_lock(&q->busylock);
3059
3060        spin_lock(root_lock);
3061        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3062                __qdisc_drop(skb, &to_free);
3063                rc = NET_XMIT_DROP;
3064        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3065                   qdisc_run_begin(q)) {
3066                /*
3067                 * This is a work-conserving queue; there are no old skbs
3068                 * waiting to be sent out; and the qdisc is not running -
3069                 * xmit the skb directly.
3070                 */
3071
3072                qdisc_bstats_update(q, skb);
3073
3074                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3075                        if (unlikely(contended)) {
3076                                spin_unlock(&q->busylock);
3077                                contended = false;
3078                        }
3079                        __qdisc_run(q);
3080                } else
3081                        qdisc_run_end(q);
3082
3083                rc = NET_XMIT_SUCCESS;
3084        } else {
3085                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3086                if (qdisc_run_begin(q)) {
3087                        if (unlikely(contended)) {
3088                                spin_unlock(&q->busylock);
3089                                contended = false;
3090                        }
3091                        __qdisc_run(q);
3092                }
3093        }
3094        spin_unlock(root_lock);
3095        if (unlikely(to_free))
3096                kfree_skb_list(to_free);
3097        if (unlikely(contended))
3098                spin_unlock(&q->busylock);
3099        return rc;
3100}
3101
3102#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3103static void skb_update_prio(struct sk_buff *skb)
3104{
3105        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3106
3107        if (!skb->priority && skb->sk && map) {
3108                unsigned int prioidx =
3109                        sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3110
3111                if (prioidx < map->priomap_len)
3112                        skb->priority = map->priomap[prioidx];
3113        }
3114}
3115#else
3116#define skb_update_prio(skb)
3117#endif
3118
3119DEFINE_PER_CPU(int, xmit_recursion);
3120EXPORT_SYMBOL(xmit_recursion);
3121
3122/**
3123 *      dev_loopback_xmit - loop back @skb
3124 *      @net: network namespace this loopback is happening in
3125 *      @sk:  sk needed to be a netfilter okfn
3126 *      @skb: buffer to transmit
3127 */
3128int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3129{
3130        skb_reset_mac_header(skb);
3131        __skb_pull(skb, skb_network_offset(skb));
3132        skb->pkt_type = PACKET_LOOPBACK;
3133        skb->ip_summed = CHECKSUM_UNNECESSARY;
3134        WARN_ON(!skb_dst(skb));
3135        skb_dst_force(skb);
3136        netif_rx_ni(skb);
3137        return 0;
3138}
3139EXPORT_SYMBOL(dev_loopback_xmit);
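
/* dev_loopback_xmit() is written to be usable as the okfn of an
 * NF_HOOK() invocation when a local copy of an outgoing packet has to
 * be fed back into the receive path.  A sketch in the spirit of the
 * IPv4 multicast output path:
 *
 *	NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 *		net, sk, newskb, NULL, newskb->dev,
 *		dev_loopback_xmit);
 */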
3140
3141#ifdef CONFIG_NET_EGRESS
3142static struct sk_buff *
3143sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3144{
3145        struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3146        struct tcf_result cl_res;
3147
3148        if (!cl)
3149                return skb;
3150
3151        /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3152         * earlier by the caller.
3153         */
3154        qdisc_bstats_cpu_update(cl->q, skb);
3155
3156        switch (tc_classify(skb, cl, &cl_res, false)) {
3157        case TC_ACT_OK:
3158        case TC_ACT_RECLASSIFY:
3159                skb->tc_index = TC_H_MIN(cl_res.classid);
3160                break;
3161        case TC_ACT_SHOT:
3162                qdisc_qstats_cpu_drop(cl->q);
3163                *ret = NET_XMIT_DROP;
3164                kfree_skb(skb);
3165                return NULL;
3166        case TC_ACT_STOLEN:
3167        case TC_ACT_QUEUED:
3168                *ret = NET_XMIT_SUCCESS;
3169                consume_skb(skb);
3170                return NULL;
3171        case TC_ACT_REDIRECT:
3172                /* No need to push/pop skb's mac_header here on egress! */
3173                skb_do_redirect(skb);
3174                *ret = NET_XMIT_SUCCESS;
3175                return NULL;
3176        default:
3177                break;
3178        }
3179
3180        return skb;
3181}
3182#endif /* CONFIG_NET_EGRESS */
3183
3184static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3185{
3186#ifdef CONFIG_XPS
3187        struct xps_dev_maps *dev_maps;
3188        struct xps_map *map;
3189        int queue_index = -1;
3190
3191        rcu_read_lock();
3192        dev_maps = rcu_dereference(dev->xps_maps);
3193        if (dev_maps) {
3194                unsigned int tci = skb->sender_cpu - 1;
3195
3196                if (dev->num_tc) {
3197                        tci *= dev->num_tc;
3198                        tci += netdev_get_prio_tc_map(dev, skb->priority);
3199                }
3200
3201                map = rcu_dereference(dev_maps->cpu_map[tci]);
3202                if (map) {
3203                        if (map->len == 1)
3204                                queue_index = map->queues[0];
3205                        else
3206                                queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3207                                                                           map->len)];
3208                        if (unlikely(queue_index >= dev->real_num_tx_queues))
3209                                queue_index = -1;
3210                }
3211        }
3212        rcu_read_unlock();
3213
3214        return queue_index;
3215#else
3216        return -1;
3217#endif
3218}
3219
3220static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3221{
3222        struct sock *sk = skb->sk;
3223        int queue_index = sk_tx_queue_get(sk);
3224
3225        if (queue_index < 0 || skb->ooo_okay ||
3226            queue_index >= dev->real_num_tx_queues) {
3227                int new_index = get_xps_queue(dev, skb);
3228                if (new_index < 0)
3229                        new_index = skb_tx_hash(dev, skb);
3230
3231                if (queue_index != new_index && sk &&
3232                    sk_fullsock(sk) &&
3233                    rcu_access_pointer(sk->sk_dst_cache))
3234                        sk_tx_queue_set(sk, new_index);
3235
3236                queue_index = new_index;
3237        }
3238
3239        return queue_index;
3240}
3241
3242struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3243                                    struct sk_buff *skb,
3244                                    void *accel_priv)
3245{
3246        int queue_index = 0;
3247
3248#ifdef CONFIG_XPS
3249        u32 sender_cpu = skb->sender_cpu - 1;
3250
3251        if (sender_cpu >= (u32)NR_CPUS)
3252                skb->sender_cpu = raw_smp_processor_id() + 1;
3253#endif
3254
3255        if (dev->real_num_tx_queues != 1) {
3256                const struct net_device_ops *ops = dev->netdev_ops;
3257                if (ops->ndo_select_queue)
3258                        queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3259                                                            __netdev_pick_tx);
3260                else
3261                        queue_index = __netdev_pick_tx(dev, skb);
3262
3263                if (!accel_priv)
3264                        queue_index = netdev_cap_txqueue(dev, queue_index);
3265        }
3266
3267        skb_set_queue_mapping(skb, queue_index);
3268        return netdev_get_tx_queue(dev, queue_index);
3269}
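
/* Drivers with their own queue-selection policy hook in through
 * ndo_select_queue() and receive __netdev_pick_tx() as the fallback.
 * A minimal sketch for a hypothetical driver "foo" that only reserves
 * queue 0 for control traffic:
 *
 *	static u16 foo_select_queue(struct net_device *dev,
 *				    struct sk_buff *skb, void *accel_priv,
 *				    select_queue_fallback_t fallback)
 *	{
 *		if (skb->priority == TC_PRIO_CONTROL)
 *			return 0;
 *		return fallback(dev, skb);
 *	}
 */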
3270
3271/**
3272 *      __dev_queue_xmit - transmit a buffer
3273 *      @skb: buffer to transmit
3274 *      @accel_priv: private data used for L2 forwarding offload
3275 *
3276 *      Queue a buffer for transmission to a network device. The caller must
3277 *      have set the device and priority and built the buffer before calling
3278 *      this function. The function can be called from an interrupt.
3279 *
3280 *      A negative errno code is returned on a failure. A success does not
3281 *      guarantee the frame will be transmitted as it may be dropped due
3282 *      to congestion or traffic shaping.
3283 *
3284 * -----------------------------------------------------------------------------------
3285 *      Note that this method can also return errors from the queue disciplines,
3286 *      including NET_XMIT_DROP, which is a positive value.  So errors can also
3287 *      be positive.
3288 *
3289 *      Regardless of the return value, the skb is consumed, so it is currently
3290 *      difficult to retry a send to this method.  (You can bump the ref count
3291 *      before sending to hold a reference for retry if you are careful.)
3292 *
3293 *      When calling this method, interrupts MUST be enabled.  This is because
3294 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3295 *          --BLG
3296 */
3297static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3298{
3299        struct net_device *dev = skb->dev;
3300        struct netdev_queue *txq;
3301        struct Qdisc *q;
3302        int rc = -ENOMEM;
3303
3304        skb_reset_mac_header(skb);
3305
3306        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3307                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3308
3309        /* Disable soft irqs for various locks below. Also
3310         * stops preemption for RCU.
3311         */
3312        rcu_read_lock_bh();
3313
3314        skb_update_prio(skb);
3315
3316        qdisc_pkt_len_init(skb);
3317#ifdef CONFIG_NET_CLS_ACT
3318        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3319# ifdef CONFIG_NET_EGRESS
3320        if (static_key_false(&egress_needed)) {
3321                skb = sch_handle_egress(skb, &rc, dev);
3322                if (!skb)
3323                        goto out;
3324        }
3325# endif
3326#endif
3327        /* If the device/qdisc doesn't need skb->dst, release it right now
3328         * while it's still hot in this CPU's cache.
3329         */
3330        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3331                skb_dst_drop(skb);
3332        else
3333                skb_dst_force(skb);
3334
3335        txq = netdev_pick_tx(dev, skb, accel_priv);
3336        q = rcu_dereference_bh(txq->qdisc);
3337
3338        trace_net_dev_queue(skb);
3339        if (q->enqueue) {
3340                rc = __dev_xmit_skb(skb, q, dev, txq);
3341                goto out;
3342        }
3343
3344        /* The device has no queue. This is the common case for software
3345           devices: loopback and all sorts of tunnels...
3346
3347           Really, it is unlikely that netif_tx_lock protection is necessary
3348           here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3349           counters.)
3350           However, it is possible that they rely on the protection
3351           we provide here.
3352
3353           So check this and take the lock anyway; it is not prone to
3354           deadlocks. Taking it for the noqueue qdisc is even simpler 8)
3355         */
3356        if (dev->flags & IFF_UP) {
3357                int cpu = smp_processor_id(); /* ok because BHs are off */
3358
3359                if (txq->xmit_lock_owner != cpu) {
3360                        if (unlikely(__this_cpu_read(xmit_recursion) >
3361                                     XMIT_RECURSION_LIMIT))
3362                                goto recursion_alert;
3363
3364                        skb = validate_xmit_skb(skb, dev);
3365                        if (!skb)
3366                                goto out;
3367
3368                        HARD_TX_LOCK(dev, txq, cpu);
3369
3370                        if (!netif_xmit_stopped(txq)) {
3371                                __this_cpu_inc(xmit_recursion);
3372                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3373                                __this_cpu_dec(xmit_recursion);
3374                                if (dev_xmit_complete(rc)) {
3375                                        HARD_TX_UNLOCK(dev, txq);
3376                                        goto out;
3377                                }
3378                        }
3379                        HARD_TX_UNLOCK(dev, txq);
3380                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3381                                             dev->name);
3382                } else {
3383                        /* Recursion is detected! It is possible,
3384                         * unfortunately
3385                         */
3386recursion_alert:
3387                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3388                                             dev->name);
3389                }
3390        }
3391
3392        rc = -ENETDOWN;
3393        rcu_read_unlock_bh();
3394
3395        atomic_long_inc(&dev->tx_dropped);
3396        kfree_skb_list(skb);
3397        return rc;
3398out:
3399        rcu_read_unlock_bh();
3400        return rc;
3401}
3402
3403int dev_queue_xmit(struct sk_buff *skb)
3404{
3405        return __dev_queue_xmit(skb, NULL);
3406}
3407EXPORT_SYMBOL(dev_queue_xmit);
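
/* A minimal sketch of a dev_queue_xmit() caller, illustrating the
 * contract documented above (dev, dest, data, len and proto are
 * assumed to come from the hypothetical caller); ownership of the skb
 * passes to the stack regardless of the return value:
 *
 *	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *	skb_reset_network_header(skb);
 *	memcpy(skb_put(skb, len), data, len);
 *	skb->dev = dev;
 *	skb->protocol = proto;
 *	dev_hard_header(skb, dev, ntohs(proto), dest, NULL, skb->len);
 *	return dev_queue_xmit(skb);
 */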
3408
3409int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3410{
3411        return __dev_queue_xmit(skb, accel_priv);
3412}
3413EXPORT_SYMBOL(dev_queue_xmit_accel);
3414
3415
3416/*=======================================================================
3417                        Receiver routines
3418  =======================================================================*/
3419
3420int netdev_max_backlog __read_mostly = 1000;
3421EXPORT_SYMBOL(netdev_max_backlog);
3422
3423int netdev_tstamp_prequeue __read_mostly = 1;
3424int netdev_budget __read_mostly = 300;
3425int weight_p __read_mostly = 64;            /* old backlog weight */
3426
3427/* Called with irq disabled */
3428static inline void ____napi_schedule(struct softnet_data *sd,
3429                                     struct napi_struct *napi)
3430{
3431        list_add_tail(&napi->poll_list, &sd->poll_list);
3432        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3433}
3434
3435#ifdef CONFIG_RPS
3436
3437/* One global table that all flow-based protocols share. */
3438struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3439EXPORT_SYMBOL(rps_sock_flow_table);
3440u32 rps_cpu_mask __read_mostly;
3441EXPORT_SYMBOL(rps_cpu_mask);
3442
3443struct static_key rps_needed __read_mostly;
3444EXPORT_SYMBOL(rps_needed);
3445struct static_key rfs_needed __read_mostly;
3446EXPORT_SYMBOL(rfs_needed);
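
/* RPS/RFS stay disabled until configured from user space: the per-queue
 * rps_cpus and rps_flow_cnt files under /sys/class/net/<dev>/queues/rx-<n>/
 * select the CPU mask and device flow table size, and sizing the global
 * socket flow table via /proc/sys/net/core/rps_sock_flow_cnt enables RFS.
 * Those writes are what flip the static keys above.
 */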
3447
3448static struct rps_dev_flow *
3449set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3450            struct rps_dev_flow *rflow, u16 next_cpu)
3451{
3452        if (next_cpu < nr_cpu_ids) {
3453#ifdef CONFIG_RFS_ACCEL
3454                struct netdev_rx_queue *rxqueue;
3455                struct rps_dev_flow_table *flow_table;
3456                struct rps_dev_flow *old_rflow;
3457                u32 flow_id;
3458                u16 rxq_index;
3459                int rc;
3460
3461                /* Should we steer this flow to a different hardware queue? */
3462                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3463                    !(dev->features & NETIF_F_NTUPLE))
3464                        goto out;
3465                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3466                if (rxq_index == skb_get_rx_queue(skb))
3467                        goto out;
3468
3469                rxqueue = dev->_rx + rxq_index;
3470                flow_table = rcu_dereference(rxqueue->rps_flow_table);
3471                if (!flow_table)
3472                        goto out;
3473                flow_id = skb_get_hash(skb) & flow_table->mask;
3474                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3475                                                        rxq_index, flow_id);
3476                if (rc < 0)
3477                        goto out;
3478                old_rflow = rflow;
3479                rflow = &flow_table->flows[flow_id];
3480                rflow->filter = rc;
3481                if (old_rflow->filter == rflow->filter)
3482                        old_rflow->filter = RPS_NO_FILTER;
3483        out:
3484#endif
3485                rflow->last_qtail =
3486                        per_cpu(softnet_data, next_cpu).input_queue_head;
3487        }
3488
3489        rflow->cpu = next_cpu;
3490        return rflow;
3491}
3492
3493/*
3494 * get_rps_cpu is called from netif_receive_skb and returns the target
3495 * CPU from the RPS map of the receiving queue for a given skb.
3496 * rcu_read_lock must be held on entry.
3497 */
3498static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3499                       struct rps_dev_flow **rflowp)
3500{
3501        const struct rps_sock_flow_table *sock_flow_table;
3502        struct netdev_rx_queue *rxqueue = dev->_rx;
3503        struct rps_dev_flow_table *flow_table;
3504        struct rps_map *map;
3505        int cpu = -1;
3506        u32 tcpu;
3507        u32 hash;
3508
3509        if (skb_rx_queue_recorded(skb)) {
3510                u16 index = skb_get_rx_queue(skb);
3511
3512                if (unlikely(index >= dev->real_num_rx_queues)) {
3513                        WARN_ONCE(dev->real_num_rx_queues > 1,
3514                                  "%s received packet on queue %u, but number "
3515                                  "of RX queues is %u\n",
3516                                  dev->name, index, dev->real_num_rx_queues);
3517                        goto done;
3518                }
3519                rxqueue += index;
3520        }
3521
3522        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3523
3524        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3525        map = rcu_dereference(rxqueue->rps_map);
3526        if (!flow_table && !map)
3527                goto done;
3528
3529        skb_reset_network_header(skb);
3530        hash = skb_get_hash(skb);
3531        if (!hash)
3532                goto done;
3533
3534        sock_flow_table = rcu_dereference(rps_sock_flow_table);
3535        if (flow_table && sock_flow_table) {
3536                struct rps_dev_flow *rflow;
3537                u32 next_cpu;
3538                u32 ident;
3539
3540                /* First, check the global flow table for a match */
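                /* Each ents[] entry packs the flow hash in the upper bits and
                 * the desired CPU in the bits covered by rps_cpu_mask.  For
                 * example, with nr_cpu_ids = 64, rps_cpu_mask is 0x3f, so an
                 * entry of 0x2bd4f085 records CPU 5 for a flow whose hash has
                 * upper bits 0x2bd4f080; the test below compares only the
                 * hash bits.
                 */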
3541                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3542                if ((ident ^ hash) & ~rps_cpu_mask)
3543                        goto try_rps;
3544
3545                next_cpu = ident & rps_cpu_mask;
3546
3547                /* OK, now we know there is a match,
3548                 * we can look at the local (per receive queue) flow table
3549                 */
3550                rflow = &flow_table->flows[hash & flow_table->mask];
3551                tcpu = rflow->cpu;
3552
3553                /*
3554                 * If the desired CPU (where last recvmsg was done) is
3555                 * different from current CPU (one in the rx-queue flow
3556                 * table entry), switch if one of the following holds:
3557                 *   - Current CPU is unset (>= nr_cpu_ids).
3558                 *   - Current CPU is offline.
3559                 *   - The current CPU's queue tail has advanced beyond the
3560                 *     last packet that was enqueued using this table entry.
3561                 *     This guarantees that all previous packets for the flow
3562                 *     have been dequeued, thus preserving in order delivery.
3563                 */
3564                if (unlikely(tcpu != next_cpu) &&
3565                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3566                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3567                      rflow->last_qtail)) >= 0)) {
3568                        tcpu = next_cpu;
3569                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3570                }
3571
3572                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3573                        *rflowp = rflow;
3574                        cpu = tcpu;
3575                        goto done;
3576                }
3577        }
3578
3579try_rps:
3580
3581        if (map) {
3582                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3583                if (cpu_online(tcpu)) {
3584                        cpu = tcpu;
3585                        goto done;
3586                }
3587        }
3588
3589done:
3590        return cpu;
3591}
3592
3593#ifdef CONFIG_RFS_ACCEL
3594
3595/**
3596 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3597 * @dev: Device on which the filter was set
3598 * @rxq_index: RX queue index
3599 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3600 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3601 *
3602 * Drivers that implement ndo_rx_flow_steer() should periodically call
3603 * this function for each installed filter and remove the filters for
3604 * which it returns %true.
3605 */
3606bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3607                         u32 flow_id, u16 filter_id)
3608{
3609        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3610        struct rps_dev_flow_table *flow_table;
3611        struct rps_dev_flow *rflow;
3612        bool expire = true;
3613        unsigned int cpu;
3614
3615        rcu_read_lock();
3616        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3617        if (flow_table && flow_id <= flow_table->mask) {
3618                rflow = &flow_table->flows[flow_id];
3619                cpu = ACCESS_ONCE(rflow->cpu);
3620                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3621                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3622                           rflow->last_qtail) <
3623                     (int)(10 * flow_table->mask)))
3624                        expire = false;
3625        }
3626        rcu_read_unlock();
3627        return expire;
3628}
3629EXPORT_SYMBOL(rps_may_expire_flow);
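
/* A sketch of the periodic expiry scan described above, for a
 * hypothetical driver that remembers the rxq index, flow id and filter
 * id it used with ndo_rx_flow_steer():
 *
 *	for (i = 0; i < priv->n_filters; i++) {
 *		struct foo_filter *f = &priv->filters[i];
 *
 *		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, f->filter_id))
 *			foo_remove_hw_filter(priv, f);
 *	}
 */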
3630
3631#endif /* CONFIG_RFS_ACCEL */
3632
3633/* Called from hardirq (IPI) context */
3634static void rps_trigger_softirq(void *data)
3635{
3636        struct softnet_data *sd = data;
3637
3638        ____napi_schedule(sd, &sd->backlog);
3639        sd->received_rps++;
3640}
3641
3642#endif /* CONFIG_RPS */
3643
3644/*
3645 * Check whether this softnet_data structure belongs to another CPU.
3646 * If so, queue it on our IPI list and return 1.
3647 * Otherwise, return 0.
3648 */
3649static int rps_ipi_queued(struct softnet_data *sd)
3650{
3651#ifdef CONFIG_RPS
3652        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3653
3654        if (sd != mysd) {
3655                sd->rps_ipi_next = mysd->rps_ipi_list;
3656                mysd->rps_ipi_list = sd;
3657
3658                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3659                return 1;
3660        }
3661#endif /* CONFIG_RPS */
3662        return 0;
3663}
3664
3665#ifdef CONFIG_NET_FLOW_LIMIT
3666int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3667#endif
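
/* Flow limits are off by default; they are switched on per CPU via
 * /proc/sys/net/core/flow_limit_cpu_bitmap, and the table length above
 * is exposed as /proc/sys/net/core/flow_limit_table_len.
 */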
3668
3669static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3670{
3671#ifdef CONFIG_NET_FLOW_LIMIT
3672        struct sd_flow_limit *fl;
3673        struct softnet_data *sd;
3674        unsigned int old_flow, new_flow;
3675
3676        if (qlen < (netdev_max_backlog >> 1))
3677                return false;
3678
3679        sd = this_cpu_ptr(&softnet_data);
3680
3681        rcu_read_lock();
3682        fl = rcu_dereference(sd->flow_limit);
3683        if (fl) {
3684                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3685                old_flow = fl->history[fl->history_head];
3686                fl->history[fl->history_head] = new_flow;
3687
3688                fl->history_head++;
3689                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3690
3691                if (likely(fl->buckets[old_flow]))
3692                        fl->buckets[old_flow]--;
3693
3694                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3695                        fl->count++;
3696                        rcu_read_unlock();
3697                        return true;
3698                }
3699        }
3700        rcu_read_unlock();
3701#endif
3702        return false;
3703}
3704
3705/*
3706 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3707 * queue (may be a remote CPU queue).
3708 */
3709static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3710                              unsigned int *qtail)
3711{
3712        struct softnet_data *sd;
3713        unsigned long flags;
3714        unsigned int qlen;
3715
3716        sd = &per_cpu(softnet_data, cpu);
3717
3718        local_irq_save(flags);
3719
3720        rps_lock(sd);
3721        if (!netif_running(skb->dev))
3722                goto drop;
3723        qlen = skb_queue_len(&sd->input_pkt_queue);
3724        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3725                if (qlen) {
3726enqueue:
3727                        __skb_queue_tail(&sd->input_pkt_queue, skb);
3728                        input_queue_tail_incr_save(sd, qtail);
3729                        rps_unlock(sd);
3730                        local_irq_restore(flags);
3731                        return NET_RX_SUCCESS;
3732                }
3733
3734                /* Schedule NAPI for the backlog device.
3735                 * We can use a non-atomic operation since we own the queue lock.
3736                 */
3737                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3738                        if (!rps_ipi_queued(sd))
3739                                ____napi_schedule(sd, &sd->backlog);
3740                }
3741                goto enqueue;
3742        }
3743
3744drop:
3745        sd->dropped++;
3746        rps_unlock(sd);
3747
3748        local_irq_restore(flags);
3749
3750        atomic_long_inc(&skb->dev->rx_dropped);
3751        kfree_skb(skb);
3752        return NET_RX_DROP;
3753}
3754
3755static int netif_rx_internal(struct sk_buff *skb)
3756{
3757        int ret;
3758
3759        net_timestamp_check(netdev_tstamp_prequeue, skb);
3760
3761        trace_netif_rx(skb);
3762#ifdef CONFIG_RPS
3763        if (static_key_false(&rps_needed)) {
3764                struct rps_dev_flow voidflow, *rflow = &voidflow;
3765                int cpu;
3766
3767                preempt_disable();
3768                rcu_read_lock();
3769
3770                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3771                if (cpu < 0)
3772                        cpu = smp_processor_id();
3773
3774                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3775
3776                rcu_read_unlock();
3777                preempt_enable();
3778        } else
3779#endif
3780        {
3781                unsigned int qtail;
3782                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3783                put_cpu();
3784        }
3785        return ret;
3786}
3787
3788/**
3789 *      netif_rx        -       post buffer to the network code
3790 *      @skb: buffer to post
3791 *
3792 *      This function receives a packet from a device driver and queues it for
3793 *      the upper (protocol) levels to process.  It always succeeds. The buffer
3794 *      may be dropped during processing for congestion control or by the
3795 *      protocol layers.
3796 *
3797 *      return values:
3798 *      NET_RX_SUCCESS  (no congestion)
3799 *      NET_RX_DROP     (packet was dropped)
3800 *
3801 */
3802
3803int netif_rx(struct sk_buff *skb)
3804{
3805        trace_netif_rx_entry(skb);
3806
3807        return netif_rx_internal(skb);
3808}
3809EXPORT_SYMBOL(netif_rx);
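
/* Typical usage in a non-NAPI driver's receive interrupt (sketch;
 * pkt_data/pkt_len come from the hypothetical device):
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	memcpy(skb_put(skb, pkt_len), pkt_data, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */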
3810
3811int netif_rx_ni(struct sk_buff *skb)
3812{
3813        int err;
3814
3815        trace_netif_rx_ni_entry(skb);
3816
3817        preempt_disable();
3818        err = netif_rx_internal(skb);
3819        if (local_softirq_pending())
3820                do_softirq();
3821        preempt_enable();
3822
3823        return err;
3824}
3825EXPORT_SYMBOL(netif_rx_ni);
3826
3827static __latent_entropy void net_tx_action(struct softirq_action *h)
3828{
3829        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3830
3831        if (sd->completion_queue) {
3832                struct sk_buff *clist;
3833
3834                local_irq_disable();
3835                clist = sd->completion_queue;
3836                sd->completion_queue = NULL;
3837                local_irq_enable();
3838
3839                while (clist) {
3840                        struct sk_buff *skb = clist;
3841                        clist = clist->next;
3842
3843                        WARN_ON(atomic_read(&skb->users));
3844                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3845                                trace_consume_skb(skb);
3846                        else
3847                                trace_kfree_skb(skb, net_tx_action);
3848
3849                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3850                                __kfree_skb(skb);
3851                        else
3852                                __kfree_skb_defer(skb);
3853                }
3854
3855                __kfree_skb_flush();
3856        }
3857
3858        if (sd->output_queue) {
3859                struct Qdisc *head;
3860
3861                local_irq_disable();
3862                head = sd->output_queue;
3863                sd->output_queue = NULL;
3864                sd->output_queue_tailp = &sd->output_queue;
3865                local_irq_enable();
3866
3867                while (head) {
3868                        struct Qdisc *q = head;
3869                        spinlock_t *root_lock;
3870
3871                        head = head->next_sched;
3872
3873                        root_lock = qdisc_lock(q);
3874                        spin_lock(root_lock);
3875                        /* We need to make sure head->next_sched is read
3876                         * before clearing __QDISC_STATE_SCHED
3877                         */
3878                        smp_mb__before_atomic();
3879                        clear_bit(__QDISC_STATE_SCHED, &q->state);
3880                        qdisc_run(q);
3881                        spin_unlock(root_lock);
3882                }
3883        }
3884}
3885
3886#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3887/* This hook is defined here for ATM LANE */
3888int (*br_fdb_test_addr_hook)(struct net_device *dev,
3889                             unsigned char *addr) __read_mostly;
3890EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3891#endif
3892
3893static inline struct sk_buff *
3894sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3895                   struct net_device *orig_dev)
3896{
3897#ifdef CONFIG_NET_CLS_ACT
3898        struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3899        struct tcf_result cl_res;
3900
3901        /* If there's at least one ingress present somewhere (so
3902         * we get here via enabled static key), remaining devices
3903         * that are not configured with an ingress qdisc will bail
3904         * out here.
3905         */
3906        if (!cl)
3907                return skb;
3908        if (*pt_prev) {
3909                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3910                *pt_prev = NULL;
3911        }
3912
3913        qdisc_skb_cb(skb)->pkt_len = skb->len;
3914        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3915        qdisc_bstats_cpu_update(cl->q, skb);
3916
3917        switch (tc_classify(skb, cl, &cl_res, false)) {
3918        case TC_ACT_OK:
3919        case TC_ACT_RECLASSIFY:
3920                skb->tc_index = TC_H_MIN(cl_res.classid);
3921                break;
3922        case TC_ACT_SHOT:
3923                qdisc_qstats_cpu_drop(cl->q);
3924                kfree_skb(skb);
3925                return NULL;
3926        case TC_ACT_STOLEN:
3927        case TC_ACT_QUEUED:
3928                consume_skb(skb);
3929                return NULL;
3930        case TC_ACT_REDIRECT:
3931                /* skb_mac_header check was done by cls/act_bpf, so
3932                 * we can safely push the L2 header back before
3933                 * redirecting to another netdev
3934                 */
3935                __skb_push(skb, skb->mac_len);
3936                skb_do_redirect(skb);
3937                return NULL;
3938        default:
3939                break;
3940        }
3941#endif /* CONFIG_NET_CLS_ACT */
3942        return skb;
3943}
3944
3945/**
3946 *      netdev_is_rx_handler_busy - check if receive handler is registered
3947 *      @dev: device to check
3948 *
3949 *      Check if a receive handler is already registered for a given device.
3950 *      Return true if there is one.
3951 *
3952 *      The caller must hold the rtnl_mutex.
3953 */
3954bool netdev_is_rx_handler_busy(struct net_device *dev)
3955{
3956        ASSERT_RTNL();
3957        return dev && rtnl_dereference(dev->rx_handler);
3958}
3959EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3960
3961/**
3962 *      netdev_rx_handler_register - register receive handler
3963 *      @dev: device to register a handler for
3964 *      @rx_handler: receive handler to register
3965 *      @rx_handler_data: data pointer that is used by rx handler
3966 *
3967 *      Register a receive handler for a device. This handler will then be
3968 *      called from __netif_receive_skb. A negative errno code is returned
3969 *      on a failure.
3970 *
3971 *      The caller must hold the rtnl_mutex.
3972 *
3973 *      For a general description of rx_handler, see enum rx_handler_result.
3974 */
3975int netdev_rx_handler_register(struct net_device *dev,
3976                               rx_handler_func_t *rx_handler,
3977                               void *rx_handler_data)
3978{
3979        ASSERT_RTNL();
3980
3981        if (dev->rx_handler)
3982                return -EBUSY;
3983
3984        /* Note: rx_handler_data must be set before rx_handler */
3985        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3986        rcu_assign_pointer(dev->rx_handler, rx_handler);
3987
3988        return 0;
3989}
3990EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
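
/* A minimal rx_handler sketch for a hypothetical upper device "foo"
 * (bridge, macvlan, bonding and team all use this interface):
 *
 *	static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct foo_port *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		if (!foo_wants_frame(port, skb))
 *			return RX_HANDLER_PASS;
 *		skb->dev = port->upper_dev;
 *		return RX_HANDLER_ANOTHER;
 *	}
 *
 * registered under rtnl_lock() with:
 *
 *	err = netdev_rx_handler_register(lower_dev, foo_handle_frame, port);
 */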
3991
3992/**
3993 *      netdev_rx_handler_unregister - unregister receive handler
3994 *      @dev: device to unregister a handler from
3995 *
3996 *      Unregister a receive handler from a device.
3997 *
3998 *      The caller must hold the rtnl_mutex.
3999 */
4000void netdev_rx_handler_unregister(struct net_device *dev)
4001{
4002
4003        ASSERT_RTNL();
4004        RCU_INIT_POINTER(dev->rx_handler, NULL);
4005        /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
4006         * section is guaranteed to see a non-NULL rx_handler_data
4007         * as well.
4008         */
4009        synchronize_net();
4010        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4011}
4012EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4013
4014/*
4015 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4016 * the special handling of PFMEMALLOC skbs.
4017 */
4018static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4019{
4020        switch (skb->protocol) {
4021        case htons(ETH_P_ARP):
4022        case htons(ETH_P_IP):
4023        case htons(ETH_P_IPV6):
4024        case htons(ETH_P_8021Q):
4025        case htons(ETH_P_8021AD):
4026                return true;
4027        default:
4028                return false;
4029        }
4030}
4031
4032static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4033                             int *ret, struct net_device *orig_dev)
4034{
4035#ifdef CONFIG_NETFILTER_INGRESS
4036        if (nf_hook_ingress_active(skb)) {
4037                int ingress_retval;
4038
4039                if (*pt_prev) {
4040                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
4041                        *pt_prev = NULL;
4042                }
4043
4044                rcu_read_lock();
4045                ingress_retval = nf_hook_ingress(skb);
4046                rcu_read_unlock();
4047                return ingress_retval;
4048        }
4049#endif /* CONFIG_NETFILTER_INGRESS */
4050        return 0;
4051}
4052
4053static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4054{
4055        struct packet_type *ptype, *pt_prev;
4056        rx_handler_func_t *rx_handler;
4057        struct net_device *orig_dev;
4058        bool deliver_exact = false;
4059        int ret = NET_RX_DROP;
4060        __be16 type;
4061
4062        net_timestamp_check(!netdev_tstamp_prequeue, skb);
4063
4064        trace_netif_receive_skb(skb);
4065
4066        orig_dev = skb->dev;
4067
4068        skb_reset_network_header(skb);
4069        if (!skb_transport_header_was_set(skb))
4070                skb_reset_transport_header(skb);
4071        skb_reset_mac_len(skb);
4072
4073        pt_prev = NULL;
4074
4075another_round:
4076        skb->skb_iif = skb->dev->ifindex;
4077
4078        __this_cpu_inc(softnet_data.processed);
4079
4080        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4081            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4082                skb = skb_vlan_untag(skb);
4083                if (unlikely(!skb))
4084                        goto out;
4085        }
4086
4087#ifdef CONFIG_NET_CLS_ACT
4088        if (skb->tc_verd & TC_NCLS) {
4089                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4090                goto ncls;
4091        }
4092#endif
4093
4094        if (pfmemalloc)
4095                goto skip_taps;
4096
4097        list_for_each_entry_rcu(ptype, &ptype_all, list) {
4098                if (pt_prev)
4099                        ret = deliver_skb(skb, pt_prev, orig_dev);
4100                pt_prev = ptype;
4101        }
4102
4103        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4104                if (pt_prev)
4105                        ret = deliver_skb(skb, pt_prev, orig_dev);
4106                pt_prev = ptype;
4107        }
4108
4109skip_taps:
4110#ifdef CONFIG_NET_INGRESS
4111        if (static_key_false(&ingress_needed)) {
4112                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4113                if (!skb)
4114                        goto out;
4115
4116                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4117                        goto out;
4118        }
4119#endif
4120#ifdef CONFIG_NET_CLS_ACT
4121        skb->tc_verd = 0;
4122ncls:
4123#endif
4124        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4125                goto drop;
4126
4127        if (skb_vlan_tag_present(skb)) {
4128                if (pt_prev) {
4129                        ret = deliver_skb(skb, pt_prev, orig_dev);
4130                        pt_prev = NULL;
4131                }
4132                if (vlan_do_receive(&skb))
4133                        goto another_round;
4134                else if (unlikely(!skb))
4135                        goto out;
4136        }
4137
4138        rx_handler = rcu_dereference(skb->dev->rx_handler);
4139        if (rx_handler) {
4140                if (pt_prev) {
4141                        ret = deliver_skb(skb, pt_prev, orig_dev);
4142                        pt_prev = NULL;
4143                }
4144                switch (rx_handler(&skb)) {
4145                case RX_HANDLER_CONSUMED:
4146                        ret = NET_RX_SUCCESS;
4147                        goto out;
4148                case RX_HANDLER_ANOTHER:
4149                        goto another_round;
4150                case RX_HANDLER_EXACT:
4151                        deliver_exact = true;
4152                case RX_HANDLER_PASS:
4153                        break;
4154                default:
4155                        BUG();
4156                }
4157        }
4158
4159        if (unlikely(skb_vlan_tag_present(skb))) {
4160                if (skb_vlan_tag_get_id(skb))
4161                        skb->pkt_type = PACKET_OTHERHOST;
4162                /* Note: we might in the future use prio bits
4163                 * and set skb->priority like in vlan_do_receive().
4164                 * For the time being, just ignore the Priority Code Point.
4165                 */
4166                skb->vlan_tci = 0;
4167        }
4168
4169        type = skb->protocol;
4170
4171        /* deliver only exact match when indicated */
4172        if (likely(!deliver_exact)) {
4173                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4174                                       &ptype_base[ntohs(type) &
4175                                                   PTYPE_HASH_MASK]);
4176        }
4177
4178        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4179                               &orig_dev->ptype_specific);
4180
4181        if (unlikely(skb->dev != orig_dev)) {
4182                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4183                                       &skb->dev->ptype_specific);
4184        }
4185
4186        if (pt_prev) {
4187                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4188                        goto drop;
4189                else
4190                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4191        } else {
4192drop:
4193                if (!deliver_exact)
4194                        atomic_long_inc(&skb->dev->rx_dropped);
4195                else
4196                        atomic_long_inc(&skb->dev->rx_nohandler);
4197                kfree_skb(skb);
4198                /* Jamal, now you will not be able to escape explaining
4199                 * to me how you were going to use this. :-)
4200                 */
4201                ret = NET_RX_DROP;
4202        }
4203
4204out:
4205        return ret;
4206}
4207
4208static int __netif_receive_skb(struct sk_buff *skb)
4209{
4210        int ret;
4211
4212        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4213                unsigned long pflags = current->flags;
4214
4215                /*
4216                 * PFMEMALLOC skbs are special, they should
4217                 * - be delivered to SOCK_MEMALLOC sockets only
4218                 * - stay away from userspace
4219                 * - have bounded memory usage
4220                 *
4221                 * Use PF_MEMALLOC as this saves us from propagating the allocation
4222                 * context down to all allocation sites.
4223                 */
4224                current->flags |= PF_MEMALLOC;
4225                ret = __netif_receive_skb_core(skb, true);
4226                tsk_restore_flags(current, pflags, PF_MEMALLOC);
4227        } else
4228                ret = __netif_receive_skb_core(skb, false);
4229
4230        return ret;
4231}
4232
4233static int netif_receive_skb_internal(struct sk_buff *skb)
4234{
4235        int ret;
4236
4237        net_timestamp_check(netdev_tstamp_prequeue, skb);
4238
4239        if (skb_defer_rx_timestamp(skb))
4240                return NET_RX_SUCCESS;
4241
4242        rcu_read_lock();
4243
4244#ifdef CONFIG_RPS
4245        if (static_key_false(&rps_needed)) {
4246                struct rps_dev_flow voidflow, *rflow = &voidflow;
4247                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4248
4249                if (cpu >= 0) {
4250                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4251                        rcu_read_unlock();
4252                        return ret;
4253                }
4254        }
4255#endif
4256        ret = __netif_receive_skb(skb);
4257        rcu_read_unlock();
4258        return ret;
4259}
4260
4261/**
4262 *      netif_receive_skb - process receive buffer from network
4263 *      @skb: buffer to process
4264 *
4265 *      netif_receive_skb() is the main receive data processing function.
4266 *      It always succeeds. The buffer may be dropped during processing
4267 *      for congestion control or by the protocol layers.
4268 *
4269 *      This function may only be called from softirq context and interrupts
4270 *      should be enabled.
4271 *
4272 *      Return values (usually ignored):
4273 *      NET_RX_SUCCESS: no congestion
4274 *      NET_RX_DROP: packet was dropped
4275 */
4276int netif_receive_skb(struct sk_buff *skb)
4277{
4278        trace_netif_receive_skb_entry(skb);
4279
4280        return netif_receive_skb_internal(skb);
4281}
4282EXPORT_SYMBOL(netif_receive_skb);
4283
4284DEFINE_PER_CPU(struct work_struct, flush_works);
4285
4286/* Network device is going away, flush any packets still pending */
4287static void flush_backlog(struct work_struct *work)
4288{
4289        struct sk_buff *skb, *tmp;
4290        struct softnet_data *sd;
4291
4292        local_bh_disable();
4293        sd = this_cpu_ptr(&softnet_data);
4294
4295        local_irq_disable();
4296        rps_lock(sd);
4297        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4298                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4299                        __skb_unlink(skb, &sd->input_pkt_queue);
4300                        kfree_skb(skb);
4301                        input_queue_head_incr(sd);
4302                }
4303        }
4304        rps_unlock(sd);
4305        local_irq_enable();
4306
4307        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4308                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4309                        __skb_unlink(skb, &sd->process_queue);
4310                        kfree_skb(skb);
4311                        input_queue_head_incr(sd);
4312                }
4313        }
4314        local_bh_enable();
4315}
4316
4317static void flush_all_backlogs(void)
4318{
4319        unsigned int cpu;
4320
4321        get_online_cpus();
4322
4323        for_each_online_cpu(cpu)
4324                queue_work_on(cpu, system_highpri_wq,
4325                              per_cpu_ptr(&flush_works, cpu));
4326
4327        for_each_online_cpu(cpu)
4328                flush_work(per_cpu_ptr(&flush_works, cpu));
4329
4330        put_online_cpus();
4331}
4332
4333static int napi_gro_complete(struct sk_buff *skb)
4334{
4335        struct packet_offload *ptype;
4336        __be16 type = skb->protocol;
4337        struct list_head *head = &offload_base;
4338        int err = -ENOENT;
4339
4340        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4341
4342        if (NAPI_GRO_CB(skb)->count == 1) {
4343                skb_shinfo(skb)->gso_size = 0;
4344                goto out;
4345        }
4346
4347        rcu_read_lock();
4348        list_for_each_entry_rcu(ptype, head, list) {
4349                if (ptype->type != type || !ptype->callbacks.gro_complete)
4350                        continue;
4351
4352                err = ptype->callbacks.gro_complete(skb, 0);
4353                break;
4354        }
4355        rcu_read_unlock();
4356
4357        if (err) {
4358                WARN_ON(&ptype->list == head);
4359                kfree_skb(skb);
4360                return NET_RX_SUCCESS;
4361        }
4362
4363out:
4364        return netif_receive_skb_internal(skb);
4365}
4366
4367/* napi->gro_list contains packets ordered by age, with the
4368 * youngest packets at the head of the list.
4369 * Complete skbs in reverse order to reduce latencies.
4370 */
4371void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4372{
4373        struct sk_buff *skb, *prev = NULL;
4374
4375        /* scan list and build reverse chain */
4376        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4377                skb->prev = prev;
4378                prev = skb;
4379        }
4380
4381        for (skb = prev; skb; skb = prev) {
4382                skb->next = NULL;
4383
4384                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4385                        return;
4386
4387                prev = skb->prev;
4388                napi_gro_complete(skb);
4389                napi->gro_count--;
4390        }
4391
4392        napi->gro_list = NULL;
4393}
4394EXPORT_SYMBOL(napi_gro_flush);
4395
4396static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4397{
4398        struct sk_buff *p;
4399        unsigned int maclen = skb->dev->hard_header_len;
4400        u32 hash = skb_get_hash_raw(skb);
4401
4402        for (p = napi->gro_list; p; p = p->next) {
4403                unsigned long diffs;
4404
4405                NAPI_GRO_CB(p)->flush = 0;
4406
4407                if (hash != skb_get_hash_raw(p)) {
4408                        NAPI_GRO_CB(p)->same_flow = 0;
4409                        continue;
4410                }
4411
4412                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4413                diffs |= p->vlan_tci ^ skb->vlan_tci;
4414                diffs |= skb_metadata_dst_cmp(p, skb);
4415                if (maclen == ETH_HLEN)
4416                        diffs |= compare_ether_header(skb_mac_header(p),
4417                                                      skb_mac_header(skb));
4418                else if (!diffs)
4419                        diffs = memcmp(skb_mac_header(p),
4420                                       skb_mac_header(skb),
4421                                       maclen);
4422                NAPI_GRO_CB(p)->same_flow = !diffs;
4423        }
4424}
4425
4426static void skb_gro_reset_offset(struct sk_buff *skb)
4427{
4428        const struct skb_shared_info *pinfo = skb_shinfo(skb);
4429        const skb_frag_t *frag0 = &pinfo->frags[0];
4430
4431        NAPI_GRO_CB(skb)->data_offset = 0;
4432        NAPI_GRO_CB(skb)->frag0 = NULL;
4433        NAPI_GRO_CB(skb)->frag0_len = 0;
4434
4435        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4436            pinfo->nr_frags &&
4437            !PageHighMem(skb_frag_page(frag0))) {
4438                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4439                NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4440                                                    skb_frag_size(frag0),
4441                                                    skb->end - skb->tail);
4442        }
4443}
4444
4445static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4446{
4447        struct skb_shared_info *pinfo = skb_shinfo(skb);
4448
4449        BUG_ON(skb->end - skb->tail < grow);
4450
4451        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4452
4453        skb->data_len -= grow;
4454        skb->tail += grow;
4455
4456        pinfo->frags[0].page_offset += grow;
4457        skb_frag_size_sub(&pinfo->frags[0], grow);
4458
4459        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4460                skb_frag_unref(skb, 0);
4461                memmove(pinfo->frags, pinfo->frags + 1,
4462                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4463        }
4464}
4465
4466static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4467{
4468        struct sk_buff **pp = NULL;
4469        struct packet_offload *ptype;
4470        __be16 type = skb->protocol;
4471        struct list_head *head = &offload_base;
4472        int same_flow;
4473        enum gro_result ret;
4474        int grow;
4475
4476        if (!(skb->dev->features & NETIF_F_GRO))
4477                goto normal;
4478
4479        if (skb->csum_bad)
4480                goto normal;
4481
4482        gro_list_prepare(napi, skb);
4483
4484        rcu_read_lock();
4485        list_for_each_entry_rcu(ptype, head, list) {
4486                if (ptype->type != type || !ptype->callbacks.gro_receive)
4487                        continue;
4488
4489                skb_set_network_header(skb, skb_gro_offset(skb));
4490                skb_reset_mac_len(skb);
4491                NAPI_GRO_CB(skb)->same_flow = 0;
4492                NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4493                NAPI_GRO_CB(skb)->free = 0;
4494                NAPI_GRO_CB(skb)->encap_mark = 0;
4495                NAPI_GRO_CB(skb)->recursion_counter = 0;
4496                NAPI_GRO_CB(skb)->is_fou = 0;
4497                NAPI_GRO_CB(skb)->is_atomic = 1;
4498                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4499
4500                /* Setup for GRO checksum validation */
4501                switch (skb->ip_summed) {
4502                case CHECKSUM_COMPLETE:
4503                        NAPI_GRO_CB(skb)->csum = skb->csum;
4504                        NAPI_GRO_CB(skb)->csum_valid = 1;
4505                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4506                        break;
4507                case CHECKSUM_UNNECESSARY:
4508                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4509                        NAPI_GRO_CB(skb)->csum_valid = 0;
4510                        break;
4511                default:
4512                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4513                        NAPI_GRO_CB(skb)->csum_valid = 0;
4514                }
4515
4516                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4517                break;
4518        }
4519        rcu_read_unlock();
4520
4521        if (&ptype->list == head)
4522                goto normal;
4523
4524        same_flow = NAPI_GRO_CB(skb)->same_flow;
4525        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4526
4527        if (pp) {
4528                struct sk_buff *nskb = *pp;
4529
4530                *pp = nskb->next;
4531                nskb->next = NULL;
4532                napi_gro_complete(nskb);
4533                napi->gro_count--;
4534        }
4535
4536        if (same_flow)
4537                goto ok;
4538
4539        if (NAPI_GRO_CB(skb)->flush)
4540                goto normal;
4541
4542        if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4543                struct sk_buff *nskb = napi->gro_list;
4544
4545                /* locate the end of the list to select the 'oldest' flow */
4546                while (nskb->next) {
4547                        pp = &nskb->next;
4548                        nskb = *pp;
4549                }
4550                *pp = NULL;
4551                nskb->next = NULL;
4552                napi_gro_complete(nskb);
4553        } else {
4554                napi->gro_count++;
4555        }
4556        NAPI_GRO_CB(skb)->count = 1;
4557        NAPI_GRO_CB(skb)->age = jiffies;
4558        NAPI_GRO_CB(skb)->last = skb;
4559        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4560        skb->next = napi->gro_list;
4561        napi->gro_list = skb;
4562        ret = GRO_HELD;
4563
4564pull:
4565        grow = skb_gro_offset(skb) - skb_headlen(skb);
4566        if (grow > 0)
4567                gro_pull_from_frag0(skb, grow);
4568ok:
4569        return ret;
4570
4571normal:
4572        ret = GRO_NORMAL;
4573        goto pull;
4574}
4575
4576struct packet_offload *gro_find_receive_by_type(__be16 type)
4577{
4578        struct list_head *offload_head = &offload_base;
4579        struct packet_offload *ptype;
4580
4581        list_for_each_entry_rcu(ptype, offload_head, list) {
4582                if (ptype->type != type || !ptype->callbacks.gro_receive)
4583                        continue;
4584                return ptype;
4585        }
4586        return NULL;
4587}
4588EXPORT_SYMBOL(gro_find_receive_by_type);
4589
4590struct packet_offload *gro_find_complete_by_type(__be16 type)
4591{
4592        struct list_head *offload_head = &offload_base;
4593        struct packet_offload *ptype;
4594
4595        list_for_each_entry_rcu(ptype, offload_head, list) {
4596                if (ptype->type != type || !ptype->callbacks.gro_complete)
4597                        continue;
4598                return ptype;
4599        }
4600        return NULL;
4601}
4602EXPORT_SYMBOL(gro_find_complete_by_type);
4603
4604static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4605{
4606        switch (ret) {
4607        case GRO_NORMAL:
4608                if (netif_receive_skb_internal(skb))
4609                        ret = GRO_DROP;
4610                break;
4611
4612        case GRO_DROP:
4613                kfree_skb(skb);
4614                break;
4615
4616        case GRO_MERGED_FREE:
4617                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4618                        skb_dst_drop(skb);
4619                        kmem_cache_free(skbuff_head_cache, skb);
4620                } else {
4621                        __kfree_skb(skb);
4622                }
4623                break;
4624
4625        case GRO_HELD:
4626        case GRO_MERGED:
4627                break;
4628        }
4629
4630        return ret;
4631}
4632
4633gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4634{
4635        skb_mark_napi_id(skb, napi);
4636        trace_napi_gro_receive_entry(skb);
4637
4638        skb_gro_reset_offset(skb);
4639
4640        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4641}
4642EXPORT_SYMBOL(napi_gro_receive);
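
/* napi_gro_receive() is the usual delivery call in a NAPI poll
 * routine; a sketch for a hypothetical driver (ring access and error
 * handling elided):
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct sk_buff *skb;
 *		int work = 0;
 *
 *		while (work < budget && (skb = foo_rx_next_skb(napi))) {
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete_done(napi, work);
 *		return work;
 *	}
 */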
4643
4644static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4645{
4646        if (unlikely(skb->pfmemalloc)) {
4647                consume_skb(skb);
4648                return;
4649        }
4650        __skb_pull(skb, skb_headlen(skb));
4651        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4652        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4653        skb->vlan_tci = 0;
4654        skb->dev = napi->dev;
4655        skb->skb_iif = 0;
4656        skb->encapsulation = 0;
4657        skb_shinfo(skb)->gso_type = 0;
4658        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4659
4660        napi->skb = skb;
4661}
4662
4663struct sk_buff *napi_get_frags(struct napi_struct *napi)
4664{
4665        struct sk_buff *skb = napi->skb;
4666
4667        if (!skb) {
4668                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4669                if (skb) {
4670                        napi->skb = skb;
4671                        skb_mark_napi_id(skb, napi);
4672                }
4673        }
4674        return skb;
4675}
4676EXPORT_SYMBOL(napi_get_frags);
4677
4678static gro_result_t napi_frags_finish(struct napi_struct *napi,
4679                                      struct sk_buff *skb,
4680                                      gro_result_t ret)
4681{
4682        switch (ret) {
4683        case GRO_NORMAL:
4684        case GRO_HELD:
4685                __skb_push(skb, ETH_HLEN);
4686                skb->protocol = eth_type_trans(skb, skb->dev);
4687                if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4688                        ret = GRO_DROP;
4689                break;
4690
4691        case GRO_DROP:
4692        case GRO_MERGED_FREE:
4693                napi_reuse_skb(napi, skb);
4694                break;
4695
4696        case GRO_MERGED:
4697                break;
4698        }
4699
4700        return ret;
4701}
4702
4703/* The upper GRO stack assumes the network header starts at gro_offset=0.
4704 * Drivers may call both napi_gro_frags() and napi_gro_receive(), so
4705 * we copy the Ethernet header into skb->data to have a common layout.
4706 */
4707static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4708{
4709        struct sk_buff *skb = napi->skb;
4710        const struct ethhdr *eth;
4711        unsigned int hlen = sizeof(*eth);
4712
4713        napi->skb = NULL;
4714
4715        skb_reset_mac_header(skb);
4716        skb_gro_reset_offset(skb);
4717
4718        eth = skb_gro_header_fast(skb, 0);
4719        if (unlikely(skb_gro_header_hard(skb, hlen))) {
4720                eth = skb_gro_header_slow(skb, hlen, 0);
4721                if (unlikely(!eth)) {
4722                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4723                                             __func__, napi->dev->name);
4724                        napi_reuse_skb(napi, skb);
4725                        return NULL;
4726                }
4727        } else {
4728                gro_pull_from_frag0(skb, hlen);
4729                NAPI_GRO_CB(skb)->frag0 += hlen;
4730                NAPI_GRO_CB(skb)->frag0_len -= hlen;
4731        }
4732        __skb_pull(skb, hlen);
4733
4734        /*
4735         * This works because the only protocols we care about don't require
4736         * special handling.
4737         * We'll fix it up properly in napi_frags_finish()
4738         */
4739        skb->protocol = eth->h_proto;
4740
4741        return skb;
4742}
4743
4744gro_result_t napi_gro_frags(struct napi_struct *napi)
4745{
4746        struct sk_buff *skb = napi_frags_skb(napi);
4747
4748        if (!skb)
4749                return GRO_DROP;
4750
4751        trace_napi_gro_frags_entry(skb);
4752
4753        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4754}
4755EXPORT_SYMBOL(napi_gro_frags);
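/* Example (hypothetical driver code, for illustration only): the frag-based
 * GRO path.  A driver that receives directly into pages asks for a
 * placeholder skb with napi_get_frags(), attaches its page fragments, and
 * submits the frame with napi_gro_frags(); the ethernet header is then
 * pulled out of frag0 by napi_frags_skb() above.  "ring" and the size
 * accounting below are made up and driver-specific.
 *
 *	skb = napi_get_frags(napi);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_fill_page_desc(skb, 0, ring->page, ring->page_offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += PAGE_SIZE;
 *	napi_gro_frags(napi);
 */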
4756
4757/* Compute the checksum from gro_offset and return the folded value
4758 * after adding in any pseudo checksum.
4759 */
4760__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4761{
4762        __wsum wsum;
4763        __sum16 sum;
4764
4765        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4766
4767        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4768        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4769        if (likely(!sum)) {
4770                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4771                    !skb->csum_complete_sw)
4772                        netdev_rx_csum_fault(skb->dev);
4773        }
4774
4775        NAPI_GRO_CB(skb)->csum = wsum;
4776        NAPI_GRO_CB(skb)->csum_valid = 1;
4777
4778        return sum;
4779}
4780EXPORT_SYMBOL(__skb_gro_checksum_complete);
4781
4782/*
4783 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4784 * Note: called with local irq disabled, but exits with local irq enabled.
4785 */
4786static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4787{
4788#ifdef CONFIG_RPS
4789        struct softnet_data *remsd = sd->rps_ipi_list;
4790
4791        if (remsd) {
4792                sd->rps_ipi_list = NULL;
4793
4794                local_irq_enable();
4795
4796                /* Send pending IPIs to kick RPS processing on remote CPUs. */
4797                while (remsd) {
4798                        struct softnet_data *next = remsd->rps_ipi_next;
4799
4800                        if (cpu_online(remsd->cpu))
4801                                smp_call_function_single_async(remsd->cpu,
4802                                                           &remsd->csd);
4803                        remsd = next;
4804                }
4805        } else
4806#endif
4807                local_irq_enable();
4808}
4809
4810static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4811{
4812#ifdef CONFIG_RPS
4813        return sd->rps_ipi_list != NULL;
4814#else
4815        return false;
4816#endif
4817}
4818
4819static int process_backlog(struct napi_struct *napi, int quota)
4820{
4821        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4822        bool again = true;
4823        int work = 0;
4824
4825        /* Check if we have pending IPIs; it's better to send them now
4826         * rather than waiting for net_rx_action() to end.
4827         */
4828        if (sd_has_rps_ipi_waiting(sd)) {
4829                local_irq_disable();
4830                net_rps_action_and_irq_enable(sd);
4831        }
4832
4833        napi->weight = weight_p;
4834        while (again) {
4835                struct sk_buff *skb;
4836
4837                while ((skb = __skb_dequeue(&sd->process_queue))) {
4838                        rcu_read_lock();
4839                        __netif_receive_skb(skb);
4840                        rcu_read_unlock();
4841                        input_queue_head_incr(sd);
4842                        if (++work >= quota)
4843                                return work;
4844
4845                }
4846
4847                local_irq_disable();
4848                rps_lock(sd);
4849                if (skb_queue_empty(&sd->input_pkt_queue)) {
4850                        /*
4851                         * Inline a custom version of __napi_complete().
4852                         * Only the current CPU owns and manipulates this napi,
4853                         * and NAPI_STATE_SCHED is the only possible flag set
4854                         * on backlog.
4855                         * We can use a plain write instead of clear_bit(),
4856                         * and we don't need an smp_mb() memory barrier.
4857                         */
4858                        napi->state = 0;
4859                        again = false;
4860                } else {
4861                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
4862                                                   &sd->process_queue);
4863                }
4864                rps_unlock(sd);
4865                local_irq_enable();
4866        }
4867
4868        return work;
4869}
4870
4871/**
4872 * __napi_schedule - schedule for receive
4873 * @n: entry to schedule
4874 *
4875 * The entry's receive function will be scheduled to run.
4876 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4877 */
4878void __napi_schedule(struct napi_struct *n)
4879{
4880        unsigned long flags;
4881
4882        local_irq_save(flags);
4883        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4884        local_irq_restore(flags);
4885}
4886EXPORT_SYMBOL(__napi_schedule);
4887
4888/**
4889 * __napi_schedule_irqoff - schedule for receive
4890 * @n: entry to schedule
4891 *
4892 * Variant of __napi_schedule() assuming hard irqs are masked
4893 */
4894void __napi_schedule_irqoff(struct napi_struct *n)
4895{
4896        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4897}
4898EXPORT_SYMBOL(__napi_schedule_irqoff);
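/* Example (hypothetical driver code, for illustration only): a device
 * interrupt handler usually masks further RX interrupts and then schedules
 * NAPI; napi_schedule_irqoff() is the wrapper normally used from hard-irq
 * context and ends up in __napi_schedule_irqoff() above.  my_priv and
 * my_disable_rx_irqs() are made up.
 *
 *	static irqreturn_t my_interrupt(int irq, void *dev_id)
 *	{
 *		struct my_priv *priv = dev_id;
 *
 *		my_disable_rx_irqs(priv);
 *		napi_schedule_irqoff(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */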
4899
4900bool __napi_complete(struct napi_struct *n)
4901{
4902        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4903
4904        /* Some drivers call us directly, instead of calling
4905         * napi_complete_done().
4906         */
4907        if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4908                return false;
4909
4910        list_del_init(&n->poll_list);
4911        smp_mb__before_atomic();
4912        clear_bit(NAPI_STATE_SCHED, &n->state);
4913        return true;
4914}
4915EXPORT_SYMBOL(__napi_complete);
4916
4917bool napi_complete_done(struct napi_struct *n, int work_done)
4918{
4919        unsigned long flags;
4920
4921        /*
4922         * 1) Don't let napi dequeue from the cpu poll list
4923         *    just in case it's running on a different cpu.
4924         * 2) If we are busy polling, do nothing here; we are
4925         *    guaranteed to be called later.
4926         */
4927        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4928                                 NAPIF_STATE_IN_BUSY_POLL)))
4929                return false;
4930
4931        if (n->gro_list) {
4932                unsigned long timeout = 0;
4933
4934                if (work_done)
4935                        timeout = n->dev->gro_flush_timeout;
4936
4937                if (timeout)
4938                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
4939                                      HRTIMER_MODE_REL_PINNED);
4940                else
4941                        napi_gro_flush(n, false);
4942        }
4943        if (likely(list_empty(&n->poll_list))) {
4944                WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4945        } else {
4946                /* If n->poll_list is not empty, we need to mask irqs */
4947                local_irq_save(flags);
4948                __napi_complete(n);
4949                local_irq_restore(flags);
4950        }
4951        return true;
4952}
4953EXPORT_SYMBOL(napi_complete_done);
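/* Example (hypothetical driver code, for illustration only): at the end of a
 * poll round a driver completes NAPI only when it did not consume the whole
 * budget, and re-enables device interrupts afterwards.  my_clean_rx_ring()
 * and my_enable_rx_irqs() are made up.
 *
 *	work = my_clean_rx_ring(priv, budget);
 *	if (work < budget) {
 *		napi_complete_done(napi, work);
 *		my_enable_rx_irqs(priv);
 *	}
 *	return work;
 */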
4954
4955/* must be called under rcu_read_lock(), as we dont take a reference */
4956static struct napi_struct *napi_by_id(unsigned int napi_id)
4957{
4958        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4959        struct napi_struct *napi;
4960
4961        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4962                if (napi->napi_id == napi_id)
4963                        return napi;
4964
4965        return NULL;
4966}
4967
4968#if defined(CONFIG_NET_RX_BUSY_POLL)
4969
4970#define BUSY_POLL_BUDGET 8
4971
4972static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
4973{
4974        int rc;
4975
4976        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
4977
4978        local_bh_disable();
4979
4980        /* All we really want here is to re-enable device interrupts.
4981         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
4982         */
4983        rc = napi->poll(napi, BUSY_POLL_BUDGET);
4984        netpoll_poll_unlock(have_poll_lock);
4985        if (rc == BUSY_POLL_BUDGET)
4986                __napi_schedule(napi);
4987        local_bh_enable();
4988        if (local_softirq_pending())
4989                do_softirq();
4990}
4991
4992bool sk_busy_loop(struct sock *sk, int nonblock)
4993{
4994        unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4995        int (*napi_poll)(struct napi_struct *napi, int budget);
4996        int (*busy_poll)(struct napi_struct *dev);
4997        void *have_poll_lock = NULL;
4998        struct napi_struct *napi;
4999        int rc;
5000
5001restart:
5002        rc = false;
5003        napi_poll = NULL;
5004
5005        rcu_read_lock();
5006
5007        napi = napi_by_id(sk->sk_napi_id);
5008        if (!napi)
5009                goto out;
5010
5011        /* Note: ndo_busy_poll method is optional in linux-4.5 */
5012        busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5013
5014        preempt_disable();
5015        for (;;) {
5016                rc = 0;
5017                local_bh_disable();
5018                if (busy_poll) {
5019                        rc = busy_poll(napi);
5020                        goto count;
5021                }
5022                if (!napi_poll) {
5023                        unsigned long val = READ_ONCE(napi->state);
5024
5025                        /* If multiple threads are competing for this napi,
5026                         * we avoid dirtying napi->state as much as we can.
5027                         */
5028                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5029                                   NAPIF_STATE_IN_BUSY_POLL))
5030                                goto count;
5031                        if (cmpxchg(&napi->state, val,
5032                                    val | NAPIF_STATE_IN_BUSY_POLL |
5033                                          NAPIF_STATE_SCHED) != val)
5034                                goto count;
5035                        have_poll_lock = netpoll_poll_lock(napi);
5036                        napi_poll = napi->poll;
5037                }
5038                rc = napi_poll(napi, BUSY_POLL_BUDGET);
5039                trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5040count:
5041                if (rc > 0)
5042                        __NET_ADD_STATS(sock_net(sk),
5043                                        LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5044                local_bh_enable();
5045
5046                if (rc == LL_FLUSH_FAILED)
5047                        break; /* permanent failure */
5048
5049                if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5050                    busy_loop_timeout(end_time))
5051                        break;
5052
5053                if (unlikely(need_resched())) {
5054                        if (napi_poll)
5055                                busy_poll_stop(napi, have_poll_lock);
5056                        preempt_enable();
5057                        rcu_read_unlock();
5058                        cond_resched();
5059                        rc = !skb_queue_empty(&sk->sk_receive_queue);
5060                        if (rc || busy_loop_timeout(end_time))
5061                                return rc;
5062                        goto restart;
5063                }
5064                cpu_relax();
5065        }
5066        if (napi_poll)
5067                busy_poll_stop(napi, have_poll_lock);
5068        preempt_enable();
5069        rc = !skb_queue_empty(&sk->sk_receive_queue);
5070out:
5071        rcu_read_unlock();
5072        return rc;
5073}
5074EXPORT_SYMBOL(sk_busy_loop);
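/* Example (userspace, for illustration only): busy polling is opted into per
 * socket with SO_BUSY_POLL (or system-wide via the net.core.busy_read
 * sysctl); sk_busy_loop() is then entered from the socket receive and poll
 * paths while waiting for data.
 *
 *	unsigned int usecs = 50;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs));
 */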
5075
5076#endif /* CONFIG_NET_RX_BUSY_POLL */
5077
5078static void napi_hash_add(struct napi_struct *napi)
5079{
5080        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5081            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5082                return;
5083
5084        spin_lock(&napi_hash_lock);
5085
5086        /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5087        do {
5088                if (unlikely(++napi_gen_id < NR_CPUS + 1))
5089                        napi_gen_id = NR_CPUS + 1;
5090        } while (napi_by_id(napi_gen_id));
5091        napi->napi_id = napi_gen_id;
5092
5093        hlist_add_head_rcu(&napi->napi_hash_node,
5094                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5095
5096        spin_unlock(&napi_hash_lock);
5097}
5098
5099/* Warning: the caller is responsible for making sure an RCU grace period
5100 * has elapsed before freeing the memory containing @napi.
5101 */
5102bool napi_hash_del(struct napi_struct *napi)
5103{
5104        bool rcu_sync_needed = false;
5105
5106        spin_lock(&napi_hash_lock);
5107
5108        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5109                rcu_sync_needed = true;
5110                hlist_del_rcu(&napi->napi_hash_node);
5111        }
5112        spin_unlock(&napi_hash_lock);
5113        return rcu_sync_needed;
5114}
5115EXPORT_SYMBOL_GPL(napi_hash_del);
5116
5117static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5118{
5119        struct napi_struct *napi;
5120
5121        napi = container_of(timer, struct napi_struct, timer);
5122        if (napi->gro_list)
5123                napi_schedule(napi);
5124
5125        return HRTIMER_NORESTART;
5126}
5127
5128void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5129                    int (*poll)(struct napi_struct *, int), int weight)
5130{
5131        INIT_LIST_HEAD(&napi->poll_list);
5132        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5133        napi->timer.function = napi_watchdog;
5134        napi->gro_count = 0;
5135        napi->gro_list = NULL;
5136        napi->skb = NULL;
5137        napi->poll = poll;
5138        if (weight > NAPI_POLL_WEIGHT)
5139                pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5140                            weight, dev->name);
5141        napi->weight = weight;
5142        list_add(&napi->dev_list, &dev->napi_list);
5143        napi->dev = dev;
5144#ifdef CONFIG_NETPOLL
5145        napi->poll_owner = -1;
5146#endif
5147        set_bit(NAPI_STATE_SCHED, &napi->state);
5148        napi_hash_add(napi);
5149}
5150EXPORT_SYMBOL(netif_napi_add);
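/* Example (hypothetical driver code, for illustration only): NAPI contexts
 * are registered at probe time and enabled when the interface is brought up.
 * my_poll() and priv are made up; NAPI_POLL_WEIGHT (64) is the usual weight.
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *
 * and later, typically from ndo_open:
 *
 *	napi_enable(&priv->napi);
 */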
5151
5152void napi_disable(struct napi_struct *n)
5153{
5154        might_sleep();
5155        set_bit(NAPI_STATE_DISABLE, &n->state);
5156
5157        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5158                msleep(1);
5159        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5160                msleep(1);
5161
5162        hrtimer_cancel(&n->timer);
5163
5164        clear_bit(NAPI_STATE_DISABLE, &n->state);
5165}
5166EXPORT_SYMBOL(napi_disable);
5167
5168/* Must be called in process context */
5169void netif_napi_del(struct napi_struct *napi)
5170{
5171        might_sleep();
5172        if (napi_hash_del(napi))
5173                synchronize_net();
5174        list_del_init(&napi->dev_list);
5175        napi_free_frags(napi);
5176
5177        kfree_skb_list(napi->gro_list);
5178        napi->gro_list = NULL;
5179        napi->gro_count = 0;
5180}
5181EXPORT_SYMBOL(netif_napi_del);
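/* Example (hypothetical driver code, for illustration only): teardown mirrors
 * the setup order.  napi_disable() is called while the interface is being
 * stopped, netif_napi_del() before the net_device is freed (free_netdev()
 * also cleans up any NAPI instances still on dev->napi_list).
 *
 * From ndo_stop:
 *	napi_disable(&priv->napi);
 *
 * From the remove path:
 *	netif_napi_del(&priv->napi);
 *	free_netdev(netdev);
 */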
5182
5183static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5184{
5185        void *have;
5186        int work, weight;
5187
5188        list_del_init(&n->poll_list);
5189
5190        have = netpoll_poll_lock(n);
5191
5192        weight = n->weight;
5193
5194        /* This NAPI_STATE_SCHED test is for avoiding a race
5195         * with netpoll's poll_napi().  Only the entity which
5196         * obtains the lock and sees NAPI_STATE_SCHED set will
5197         * actually make the ->poll() call.  Therefore we avoid
5198         * accidentally calling ->poll() when NAPI is not scheduled.
5199         */
5200        work = 0;
5201        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5202                work = n->poll(n, weight);
5203                trace_napi_poll(n, work, weight);
5204        }
5205
5206        WARN_ON_ONCE(work > weight);
5207
5208        if (likely(work < weight))
5209                goto out_unlock;
5210
5211        /* Drivers must not modify the NAPI state if they
5212         * consume the entire weight.  In such cases this code
5213         * still "owns" the NAPI instance and therefore can
5214         * move the instance around on the list at-will.
5215         */
5216        if (unlikely(napi_disable_pending(n))) {
5217                napi_complete(n);
5218                goto out_unlock;
5219        }
5220
5221        if (n->gro_list) {
5222                /* Flush packets that are too old.
5223                 * If HZ < 1000, flush all packets.
5224                 */
5225                napi_gro_flush(n, HZ >= 1000);
5226        }
5227
5228        /* Some drivers may have called napi_schedule
5229         * prior to exhausting their budget.
5230         */
5231        if (unlikely(!list_empty(&n->poll_list))) {
5232                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5233                             n->dev ? n->dev->name : "backlog");
5234                goto out_unlock;
5235        }
5236
5237        list_add_tail(&n->poll_list, repoll);
5238
5239out_unlock:
5240        netpoll_poll_unlock(have);
5241
5242        return work;
5243}
5244
5245static __latent_entropy void net_rx_action(struct softirq_action *h)
5246{
5247        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5248        unsigned long time_limit = jiffies + 2;
5249        int budget = netdev_budget;
5250        LIST_HEAD(list);
5251        LIST_HEAD(repoll);
5252
5253        local_irq_disable();
5254        list_splice_init(&sd->poll_list, &list);
5255        local_irq_enable();
5256
5257        for (;;) {
5258                struct napi_struct *n;
5259
5260                if (list_empty(&list)) {
5261                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5262                                goto out;
5263                        break;
5264                }
5265
5266                n = list_first_entry(&list, struct napi_struct, poll_list);
5267                budget -= napi_poll(n, &repoll);
5268
5269                /* If the softirq window is exhausted then punt.
5270                 * Allow this to run for 2 jiffies, which allows
5271                 * an average latency of 1.5/HZ.
5272                 */
5273                if (unlikely(budget <= 0 ||
5274                             time_after_eq(jiffies, time_limit))) {
5275                        sd->time_squeeze++;
5276                        break;
5277                }
5278        }
5279
5280        local_irq_disable();
5281
5282        list_splice_tail_init(&sd->poll_list, &list);
5283        list_splice_tail(&repoll, &list);
5284        list_splice(&list, &sd->poll_list);
5285        if (!list_empty(&sd->poll_list))
5286                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5287
5288        net_rps_action_and_irq_enable(sd);
5289out:
5290        __kfree_skb_flush();
5291}
5292
5293struct netdev_adjacent {
5294        struct net_device *dev;
5295
5296        /* upper master flag; there can only be one master device per list */
5297        bool master;
5298
5299        /* counter for the number of times this device was added to us */
5300        u16 ref_nr;
5301
5302        /* private field for the users */
5303        void *private;
5304
5305        struct list_head list;
5306        struct rcu_head rcu;
5307};
5308
5309static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5310                                                 struct list_head *adj_list)
5311{
5312        struct netdev_adjacent *adj;
5313
5314        list_for_each_entry(adj, adj_list, list) {
5315                if (adj->dev == adj_dev)
5316                        return adj;
5317        }
5318        return NULL;
5319}
5320
5321static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5322{
5323        struct net_device *dev = data;
5324
5325        return upper_dev == dev;
5326}
5327
5328/**
5329 * netdev_has_upper_dev - Check if device is linked to an upper device
5330 * @dev: device
5331 * @upper_dev: upper device to check
5332 *
5333 * Find out if a device is linked to the specified upper device and return
5334 * true in case it is. Note that this walks the entire chain of upper
5335 * devices, not only the immediate upper device. The caller must hold the RTNL lock.
5336 */
5337bool netdev_has_upper_dev(struct net_device *dev,
5338                          struct net_device *upper_dev)
5339{
5340        ASSERT_RTNL();
5341
5342        return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5343                                             upper_dev);
5344}
5345EXPORT_SYMBOL(netdev_has_upper_dev);
5346
5347/**
5348 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
5349 * @dev: device
5350 * @upper_dev: upper device to check
5351 *
5352 * Find out if a device is linked to the specified upper device and return
5353 * true in case it is. Note that this checks the entire upper device chain.
5354 * The caller must hold the RCU read lock.
5355 */
5356
5357bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5358                                  struct net_device *upper_dev)
5359{
5360        return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5361                                               upper_dev);
5362}
5363EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5364
5365/**
5366 * netdev_has_any_upper_dev - Check if device is linked to some device
5367 * @dev: device
5368 *
5369 * Find out if a device is linked to an upper device and return true in case
5370 * it is. The caller must hold the RTNL lock.
5371 */
5372static bool netdev_has_any_upper_dev(struct net_device *dev)
5373{
5374        ASSERT_RTNL();
5375
5376        return !list_empty(&dev->adj_list.upper);
5377}
5378
5379/**
5380 * netdev_master_upper_dev_get - Get master upper device
5381 * @dev: device
5382 *
5383 * Find a master upper device and return pointer to it or NULL in case
5384 * it's not there. The caller must hold the RTNL lock.
5385 */
5386struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5387{
5388        struct netdev_adjacent *upper;
5389
5390        ASSERT_RTNL();
5391
5392        if (list_empty(&dev->adj_list.upper))
5393                return NULL;
5394
5395        upper = list_first_entry(&dev->adj_list.upper,
5396                                 struct netdev_adjacent, list);
5397        if (likely(upper->master))
5398                return upper->dev;
5399        return NULL;
5400}
5401EXPORT_SYMBOL(netdev_master_upper_dev_get);
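/* Example (for illustration only): under RTNL a driver can ask whether it is
 * enslaved and which device is the master, e.g. a bond or a bridge.
 * slave_dev is made up.
 *
 *	ASSERT_RTNL();
 *	master = netdev_master_upper_dev_get(slave_dev);
 *	if (master)
 *		netdev_info(slave_dev, "enslaved to %s\n", master->name);
 */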
5402
5403/**
5404 * netdev_has_any_lower_dev - Check if device is linked to some device
5405 * @dev: device
5406 *
5407 * Find out if a device is linked to a lower device and return true in case
5408 * it is. The caller must hold the RTNL lock.
5409 */
5410static bool netdev_has_any_lower_dev(struct net_device *dev)
5411{
5412        ASSERT_RTNL();
5413
5414        return !list_empty(&dev->adj_list.lower);
5415}
5416
5417void *netdev_adjacent_get_private(struct list_head *adj_list)
5418{
5419        struct netdev_adjacent *adj;
5420
5421        adj = list_entry(adj_list, struct netdev_adjacent, list);
5422
5423        return adj->private;
5424}
5425EXPORT_SYMBOL(netdev_adjacent_get_private);
5426
5427/**
5428 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5429 * @dev: device
5430 * @iter: list_head ** of the current position
5431 *
5432 * Gets the next device from the dev's upper list, starting from iter
5433 * position. The caller must hold RCU read lock.
5434 */
5435struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5436                                                 struct list_head **iter)
5437{
5438        struct netdev_adjacent *upper;
5439
5440        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5441
5442        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5443
5444        if (&upper->list == &dev->adj_list.upper)
5445                return NULL;
5446
5447        *iter = &upper->list;
5448
5449        return upper->dev;
5450}
5451EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5452
5453static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5454                                                    struct list_head **iter)
5455{
5456        struct netdev_adjacent *upper;
5457
5458        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5459
5460        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5461
5462        if (&upper->list == &dev->adj_list.upper)
5463                return NULL;
5464
5465        *iter = &upper->list;
5466
5467        return upper->dev;
5468}
5469
5470int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5471                                  int (*fn)(struct net_device *dev,
5472                                            void *data),
5473                                  void *data)
5474{
5475        struct net_device *udev;
5476        struct list_head *iter;
5477        int ret;
5478
5479        for (iter = &dev->adj_list.upper,
5480             udev = netdev_next_upper_dev_rcu(dev, &iter);
5481             udev;
5482             udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5483                /* first is the upper device itself */
5484                ret = fn(udev, data);
5485                if (ret)
5486                        return ret;
5487
5488                /* then look at all of its upper devices */
5489                ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5490                if (ret)
5491                        return ret;
5492        }
5493
5494        return 0;
5495}
5496EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
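/* Example (for illustration only): callers pass a callback that is invoked
 * for each device in the upper chain; a non-zero return value stops the
 * walk and is propagated back.  count_uppers() is a made-up callback.
 *
 *	static int count_uppers(struct net_device *upper, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int n = 0;
 *
 *	rcu_read_lock();
 *	netdev_walk_all_upper_dev_rcu(dev, count_uppers, &n);
 *	rcu_read_unlock();
 */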
5497
5498/**
5499 * netdev_lower_get_next_private - Get the next ->private from the
5500 *                                 lower neighbour list
5501 * @dev: device
5502 * @iter: list_head ** of the current position
5503 *
5504 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5505 * list, starting from iter position. The caller must either hold the
5506 * RTNL lock or its own locking that guarantees that the neighbour lower
5507 * list will remain unchanged.
5508 */
5509void *netdev_lower_get_next_private(struct net_device *dev,
5510                                    struct list_head **iter)
5511{
5512        struct netdev_adjacent *lower;
5513
5514        lower = list_entry(*iter, struct netdev_adjacent, list);
5515
5516        if (&lower->list == &dev->adj_list.lower)
5517                return NULL;
5518
5519        *iter = lower->list.next;
5520
5521        return lower->private;
5522}
5523EXPORT_SYMBOL(netdev_lower_get_next_private);
5524
5525/**
5526 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5527 *                                     lower neighbour list, RCU
5528 *                                     variant
5529 * @dev: device
5530 * @iter: list_head ** of the current position
5531 *
5532 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5533 * list, starting from iter position. The caller must hold RCU read lock.
5534 */
5535void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5536                                        struct list_head **iter)
5537{
5538        struct netdev_adjacent *lower;
5539
5540        WARN_ON_ONCE(!rcu_read_lock_held());
5541
5542        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5543
5544        if (&lower->list == &dev->adj_list.lower)
5545                return NULL;
5546
5547        *iter = &lower->list;
5548
5549        return lower->private;
5550}
5551EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5552
5553/**
5554 * netdev_lower_get_next - Get the next device from the lower neighbour
5555 *                         list
5556 * @dev: device
5557 * @iter: list_head ** of the current position
5558 *
5559 * Gets the next netdev_adjacent from the dev's lower neighbour
5560 * list, starting from iter position. The caller must hold RTNL lock or
5561 * its own locking that guarantees that the neighbour lower
5562 * list will remain unchanged.
5563 */
5564void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5565{
5566        struct netdev_adjacent *lower;
5567
5568        lower = list_entry(*iter, struct netdev_adjacent, list);
5569
5570        if (&lower->list == &dev->adj_list.lower)
5571                return NULL;
5572
5573        *iter = lower->list.next;
5574
5575        return lower->dev;
5576}
5577EXPORT_SYMBOL(netdev_lower_get_next);
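/* Example (for illustration only): this iterator is normally used through the
 * netdev_for_each_lower_dev() helper macro, under RTNL or equivalent
 * protection of the adjacency lists.
 *
 *	struct net_device *ldev;
 *	struct list_head *iter;
 *
 *	netdev_for_each_lower_dev(dev, ldev, iter)
 *		netdev_info(dev, "lower: %s\n", ldev->name);
 */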
5578
5579static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5580                                                struct list_head **iter)
5581{
5582        struct netdev_adjacent *lower;
5583
5584        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5585
5586        if (&lower->list == &dev->adj_list.lower)
5587                return NULL;
5588
5589        *iter = &lower->list;
5590
5591        return lower->dev;
5592}
5593
5594int netdev_walk_all_lower_dev(struct net_device *dev,
5595                              int (*fn)(struct net_device *dev,
5596                                        void *data),
5597                              void *data)
5598{
5599        struct net_device *ldev;
5600        struct list_head *iter;
5601        int ret;
5602
5603        for (iter = &dev->adj_list.lower,
5604             ldev = netdev_next_lower_dev(dev, &iter);
5605             ldev;
5606             ldev = netdev_next_lower_dev(dev, &iter)) {
5607                /* first is the lower device itself */
5608                ret = fn(ldev, data);
5609                if (ret)
5610                        return ret;
5611
5612                /* then look at all of its lower devices */
5613                ret = netdev_walk_all_lower_dev(ldev, fn, data);
5614                if (ret)
5615                        return ret;
5616        }
5617
5618        return 0;
5619}
5620EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5621
5622static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5623                                                    struct list_head **iter)
5624{
5625        struct netdev_adjacent *lower;
5626
5627        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5628        if (&lower->list == &dev->adj_list.lower)
5629                return NULL;
5630
5631        *iter = &lower->list;
5632
5633        return lower->dev;
5634}
5635
5636int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5637                                  int (*fn)(struct net_device *dev,
5638                                            void *data),
5639                                  void *data)
5640{
5641        struct net_device *ldev;
5642        struct list_head *iter;
5643        int ret;
5644
5645        for (iter = &dev->adj_list.lower,
5646             ldev =