linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <linux/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/sched/mm.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <linux/bpf.h>
  99#include <linux/bpf_trace.h>
 100#include <net/net_namespace.h>
 101#include <net/sock.h>
 102#include <net/busy_poll.h>
 103#include <linux/rtnetlink.h>
 104#include <linux/stat.h>
 105#include <net/dst.h>
 106#include <net/dst_metadata.h>
 107#include <net/pkt_sched.h>
 108#include <net/pkt_cls.h>
 109#include <net/checksum.h>
 110#include <net/xfrm.h>
 111#include <linux/highmem.h>
 112#include <linux/init.h>
 113#include <linux/module.h>
 114#include <linux/netpoll.h>
 115#include <linux/rcupdate.h>
 116#include <linux/delay.h>
 117#include <net/iw_handler.h>
 118#include <asm/current.h>
 119#include <linux/audit.h>
 120#include <linux/dmaengine.h>
 121#include <linux/err.h>
 122#include <linux/ctype.h>
 123#include <linux/if_arp.h>
 124#include <linux/if_vlan.h>
 125#include <linux/ip.h>
 126#include <net/ip.h>
 127#include <net/mpls.h>
 128#include <linux/ipv6.h>
 129#include <linux/in.h>
 130#include <linux/jhash.h>
 131#include <linux/random.h>
 132#include <trace/events/napi.h>
 133#include <trace/events/net.h>
 134#include <trace/events/skb.h>
 135#include <linux/pci.h>
 136#include <linux/inetdevice.h>
 137#include <linux/cpu_rmap.h>
 138#include <linux/static_key.h>
 139#include <linux/hashtable.h>
 140#include <linux/vmalloc.h>
 141#include <linux/if_macvlan.h>
 142#include <linux/errqueue.h>
 143#include <linux/hrtimer.h>
 144#include <linux/netfilter_ingress.h>
 145#include <linux/crash_dump.h>
 146#include <linux/sctp.h>
 147#include <net/udp_tunnel.h>
 148#include <linux/net_namespace.h>
 149
 150#include "net-sysfs.h"
 151
 152/* Instead of increasing this, you should create a hash table. */
 153#define MAX_GRO_SKBS 8
 154
 155/* This should be increased if a protocol with a bigger head is added. */
 156#define GRO_MAX_HEAD (MAX_HEADER + 128)
 157
 158static DEFINE_SPINLOCK(ptype_lock);
 159static DEFINE_SPINLOCK(offload_lock);
 160struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 161struct list_head ptype_all __read_mostly;       /* Taps */
 162static struct list_head offload_base __read_mostly;
 163
 164static int netif_rx_internal(struct sk_buff *skb);
 165static int call_netdevice_notifiers_info(unsigned long val,
 166                                         struct netdev_notifier_info *info);
 167static struct napi_struct *napi_by_id(unsigned int napi_id);
 168
 169/*
 170 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 171 * semaphore.
 172 *
 173 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 174 *
 175 * Writers must hold the rtnl semaphore while they loop through the
 176 * dev_base_head list, and hold dev_base_lock for writing when they do the
 177 * actual updates.  This allows pure readers to access the list even
 178 * while a writer is preparing to update it.
 179 *
 180 * To put it another way, dev_base_lock is held for writing only to
 181 * protect against pure readers; the rtnl semaphore provides the
 182 * protection against other writers.
 183 *
 184 * See, for example usages, register_netdevice() and
 185 * unregister_netdevice(), which must be called with the rtnl
 186 * semaphore held.
 187 */
 188DEFINE_RWLOCK(dev_base_lock);
 189EXPORT_SYMBOL(dev_base_lock);
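/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the two "pure reader" patterns described in the comment above.  Readers
 * either take dev_base_lock for reading or use RCU; writers additionally
 * hold the RTNL semaphore.  example_walk_devices() is a hypothetical name.
 */
#if 0
static void example_walk_devices(struct net *net)
{
	struct net_device *dev;

	/* Reader pattern 1: dev_base_lock held for reading. */
	read_lock(&dev_base_lock);
	for_each_netdev(net, dev)
		pr_info("dev %s ifindex %d\n", dev->name, dev->ifindex);
	read_unlock(&dev_base_lock);

	/* Reader pattern 2: RCU read-side critical section. */
	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		pr_info("dev %s\n", dev->name);
	rcu_read_unlock();
}
#endif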
 190
 191static DEFINE_MUTEX(ifalias_mutex);
 192
 193/* protects napi_hash addition/deletion and napi_gen_id */
 194static DEFINE_SPINLOCK(napi_hash_lock);
 195
 196static unsigned int napi_gen_id = NR_CPUS;
 197static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 198
 199static seqcount_t devnet_rename_seq;
 200
 201static inline void dev_base_seq_inc(struct net *net)
 202{
 203        while (++net->dev_base_seq == 0)
 204                ;
 205}
 206
 207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208{
 209        unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 210
 211        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 212}
 213
 214static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 215{
 216        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 217}
 218
 219static inline void rps_lock(struct softnet_data *sd)
 220{
 221#ifdef CONFIG_RPS
 222        spin_lock(&sd->input_pkt_queue.lock);
 223#endif
 224}
 225
 226static inline void rps_unlock(struct softnet_data *sd)
 227{
 228#ifdef CONFIG_RPS
 229        spin_unlock(&sd->input_pkt_queue.lock);
 230#endif
 231}
 232
 233/* Device list insertion */
 234static void list_netdevice(struct net_device *dev)
 235{
 236        struct net *net = dev_net(dev);
 237
 238        ASSERT_RTNL();
 239
 240        write_lock_bh(&dev_base_lock);
 241        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 242        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 243        hlist_add_head_rcu(&dev->index_hlist,
 244                           dev_index_hash(net, dev->ifindex));
 245        write_unlock_bh(&dev_base_lock);
 246
 247        dev_base_seq_inc(net);
 248}
 249
 250/* Device list removal
 251 * caller must respect a RCU grace period before freeing/reusing dev
 252 */
 253static void unlist_netdevice(struct net_device *dev)
 254{
 255        ASSERT_RTNL();
 256
 257        /* Unlink dev from the device chain */
 258        write_lock_bh(&dev_base_lock);
 259        list_del_rcu(&dev->dev_list);
 260        hlist_del_rcu(&dev->name_hlist);
 261        hlist_del_rcu(&dev->index_hlist);
 262        write_unlock_bh(&dev_base_lock);
 263
 264        dev_base_seq_inc(dev_net(dev));
 265}
 266
 267/*
 268 *      Our notifier list
 269 */
 270
 271static RAW_NOTIFIER_HEAD(netdev_chain);
 272
 273/*
 274 *      Device drivers call our routines to queue packets here. We empty the
 275 *      queue in the local softnet handler.
 276 */
 277
 278DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 279EXPORT_PER_CPU_SYMBOL(softnet_data);
 280
 281#ifdef CONFIG_LOCKDEP
 282/*
 283 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 284 * according to dev->type
 285 */
 286static const unsigned short netdev_lock_type[] = {
 287         ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 288         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 289         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 290         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 291         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 292         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 293         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 294         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 295         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 296         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 297         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 298         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 299         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 300         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 301         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 302
 303static const char *const netdev_lock_name[] = {
 304        "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 305        "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 306        "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 307        "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 308        "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 309        "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 310        "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 311        "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 312        "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 313        "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 314        "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 315        "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 316        "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 317        "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 318        "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 319
 320static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 321static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 322
 323static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 324{
 325        int i;
 326
 327        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 328                if (netdev_lock_type[i] == dev_type)
 329                        return i;
 330        /* the last key is used by default */
 331        return ARRAY_SIZE(netdev_lock_type) - 1;
 332}
 333
 334static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 335                                                 unsigned short dev_type)
 336{
 337        int i;
 338
 339        i = netdev_lock_pos(dev_type);
 340        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 341                                   netdev_lock_name[i]);
 342}
 343
 344static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345{
 346        int i;
 347
 348        i = netdev_lock_pos(dev->type);
 349        lockdep_set_class_and_name(&dev->addr_list_lock,
 350                                   &netdev_addr_lock_key[i],
 351                                   netdev_lock_name[i]);
 352}
 353#else
 354static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 355                                                 unsigned short dev_type)
 356{
 357}
 358static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 359{
 360}
 361#endif
 362
 363/*******************************************************************************
 364 *
 365 *              Protocol management and registration routines
 366 *
 367 *******************************************************************************/
 368
 369
 370/*
 371 *      Add a protocol ID to the list. Now that the input handler is
 372 *      smarter we can dispense with all the messy stuff that used to be
 373 *      here.
 374 *
 375 *      BEWARE!!! Protocol handlers, mangling input packets,
 376 *      MUST BE last in hash buckets and checking protocol handlers
 377 *      MUST start from promiscuous ptype_all chain in net_bh.
 378 *      It is true now, do not change it.
 379 *      Explanation follows: if protocol handler, mangling packet, will
 380 *      be the first on list, it is not able to sense, that packet
 381 *      is cloned and should be copied-on-write, so that it will
 382 *      change it and subsequent readers will get broken packet.
 383 *                                                      --ANK (980803)
 384 */
 385
 386static inline struct list_head *ptype_head(const struct packet_type *pt)
 387{
 388        if (pt->type == htons(ETH_P_ALL))
 389                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 390        else
 391                return pt->dev ? &pt->dev->ptype_specific :
 392                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 393}
 394
 395/**
 396 *      dev_add_pack - add packet handler
 397 *      @pt: packet type declaration
 398 *
 399 *      Add a protocol handler to the networking stack. The passed &packet_type
 400 *      is linked into kernel lists and may not be freed until it has been
 401 *      removed from the kernel lists.
 402 *
  403 *      This call does not sleep and therefore cannot guarantee that
  404 *      all CPUs currently in the middle of receiving packets
  405 *      will see the new packet type (until the next received packet).
 406 */
 407
 408void dev_add_pack(struct packet_type *pt)
 409{
 410        struct list_head *head = ptype_head(pt);
 411
 412        spin_lock(&ptype_lock);
 413        list_add_rcu(&pt->list, head);
 414        spin_unlock(&ptype_lock);
 415}
 416EXPORT_SYMBOL(dev_add_pack);
 417
 418/**
 419 *      __dev_remove_pack        - remove packet handler
 420 *      @pt: packet type declaration
 421 *
 422 *      Remove a protocol handler that was previously added to the kernel
 423 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 424 *      from the kernel lists and can be freed or reused once this function
 425 *      returns.
 426 *
 427 *      The packet type might still be in use by receivers
 428 *      and must not be freed until after all the CPU's have gone
 429 *      through a quiescent state.
 430 */
 431void __dev_remove_pack(struct packet_type *pt)
 432{
 433        struct list_head *head = ptype_head(pt);
 434        struct packet_type *pt1;
 435
 436        spin_lock(&ptype_lock);
 437
 438        list_for_each_entry(pt1, head, list) {
 439                if (pt == pt1) {
 440                        list_del_rcu(&pt->list);
 441                        goto out;
 442                }
 443        }
 444
 445        pr_warn("dev_remove_pack: %p not found\n", pt);
 446out:
 447        spin_unlock(&ptype_lock);
 448}
 449EXPORT_SYMBOL(__dev_remove_pack);
 450
 451/**
 452 *      dev_remove_pack  - remove packet handler
 453 *      @pt: packet type declaration
 454 *
 455 *      Remove a protocol handler that was previously added to the kernel
 456 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 457 *      from the kernel lists and can be freed or reused once this function
 458 *      returns.
 459 *
 460 *      This call sleeps to guarantee that no CPU is looking at the packet
 461 *      type after return.
 462 */
 463void dev_remove_pack(struct packet_type *pt)
 464{
 465        __dev_remove_pack(pt);
 466
 467        synchronize_net();
 468}
 469EXPORT_SYMBOL(dev_remove_pack);
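/*
 * Illustrative sketch (editorial addition): how a module might register and
 * later remove a packet handler with dev_add_pack()/dev_remove_pack().  The
 * example_tap* names are hypothetical; ETH_P_ALL makes the handler see every
 * received packet, as described for the ptype_all chain above.
 */
#if 0
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* Each tap gets its own clone of the skb; consume our reference. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),
	.func = example_tap_rcv,
};

static void example_tap_register(void)
{
	dev_add_pack(&example_tap);
}

static void example_tap_unregister(void)
{
	/* Sleeps via synchronize_net() so no CPU still sees the handler. */
	dev_remove_pack(&example_tap);
}
#endif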
 470
 471
 472/**
 473 *      dev_add_offload - register offload handlers
 474 *      @po: protocol offload declaration
 475 *
 476 *      Add protocol offload handlers to the networking stack. The passed
 477 *      &proto_offload is linked into kernel lists and may not be freed until
 478 *      it has been removed from the kernel lists.
 479 *
  480 *      This call does not sleep and therefore cannot guarantee that
  481 *      all CPUs currently in the middle of receiving packets
  482 *      will see the new offload handlers (until the next received packet).
 483 */
 484void dev_add_offload(struct packet_offload *po)
 485{
 486        struct packet_offload *elem;
 487
 488        spin_lock(&offload_lock);
 489        list_for_each_entry(elem, &offload_base, list) {
 490                if (po->priority < elem->priority)
 491                        break;
 492        }
 493        list_add_rcu(&po->list, elem->list.prev);
 494        spin_unlock(&offload_lock);
 495}
 496EXPORT_SYMBOL(dev_add_offload);
 497
 498/**
 499 *      __dev_remove_offload     - remove offload handler
 500 *      @po: packet offload declaration
 501 *
 502 *      Remove a protocol offload handler that was previously added to the
 503 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 504 *      is removed from the kernel lists and can be freed or reused once this
 505 *      function returns.
 506 *
 507 *      The packet type might still be in use by receivers
 508 *      and must not be freed until after all the CPU's have gone
 509 *      through a quiescent state.
 510 */
 511static void __dev_remove_offload(struct packet_offload *po)
 512{
 513        struct list_head *head = &offload_base;
 514        struct packet_offload *po1;
 515
 516        spin_lock(&offload_lock);
 517
 518        list_for_each_entry(po1, head, list) {
 519                if (po == po1) {
 520                        list_del_rcu(&po->list);
 521                        goto out;
 522                }
 523        }
 524
 525        pr_warn("dev_remove_offload: %p not found\n", po);
 526out:
 527        spin_unlock(&offload_lock);
 528}
 529
 530/**
 531 *      dev_remove_offload       - remove packet offload handler
 532 *      @po: packet offload declaration
 533 *
 534 *      Remove a packet offload handler that was previously added to the kernel
 535 *      offload handlers by dev_add_offload(). The passed &offload_type is
 536 *      removed from the kernel lists and can be freed or reused once this
 537 *      function returns.
 538 *
 539 *      This call sleeps to guarantee that no CPU is looking at the packet
 540 *      type after return.
 541 */
 542void dev_remove_offload(struct packet_offload *po)
 543{
 544        __dev_remove_offload(po);
 545
 546        synchronize_net();
 547}
 548EXPORT_SYMBOL(dev_remove_offload);
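/*
 * Illustrative sketch (editorial addition): registering protocol offload
 * (GRO/GSO) callbacks with dev_add_offload().  The callback implementations
 * are omitted and the example_offload* names are hypothetical; only the
 * registration pattern and the priority-ordered insertion are shown.
 */
#if 0
static struct packet_offload example_offload __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.priority = 10,
	/* .callbacks.gso_segment / .gro_receive / .gro_complete go here. */
};

static void example_offload_register(void)
{
	dev_add_offload(&example_offload);
}

static void example_offload_unregister(void)
{
	dev_remove_offload(&example_offload);	/* sleeps via synchronize_net() */
}
#endif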
 549
 550/******************************************************************************
 551 *
 552 *                    Device Boot-time Settings Routines
 553 *
 554 ******************************************************************************/
 555
 556/* Boot time configuration table */
 557static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 558
 559/**
 560 *      netdev_boot_setup_add   - add new setup entry
 561 *      @name: name of the device
 562 *      @map: configured settings for the device
 563 *
  564 *      Adds a new setup entry to the dev_boot_setup list.  The function
  565 *      returns 0 on error and 1 on success.  This is a generic routine for
  566 *      all netdevices.
 567 */
 568static int netdev_boot_setup_add(char *name, struct ifmap *map)
 569{
 570        struct netdev_boot_setup *s;
 571        int i;
 572
 573        s = dev_boot_setup;
 574        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 575                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 576                        memset(s[i].name, 0, sizeof(s[i].name));
 577                        strlcpy(s[i].name, name, IFNAMSIZ);
 578                        memcpy(&s[i].map, map, sizeof(s[i].map));
 579                        break;
 580                }
 581        }
 582
 583        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 584}
 585
 586/**
 587 * netdev_boot_setup_check      - check boot time settings
 588 * @dev: the netdevice
 589 *
  590 * Check boot time settings for the device.
  591 * The found settings are applied to the device to be used
  592 * later in the device probing.
  593 * Returns 0 if no settings are found, 1 if they are.
 594 */
 595int netdev_boot_setup_check(struct net_device *dev)
 596{
 597        struct netdev_boot_setup *s = dev_boot_setup;
 598        int i;
 599
 600        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 601                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 602                    !strcmp(dev->name, s[i].name)) {
 603                        dev->irq = s[i].map.irq;
 604                        dev->base_addr = s[i].map.base_addr;
 605                        dev->mem_start = s[i].map.mem_start;
 606                        dev->mem_end = s[i].map.mem_end;
 607                        return 1;
 608                }
 609        }
 610        return 0;
 611}
 612EXPORT_SYMBOL(netdev_boot_setup_check);
 613
 614
 615/**
 616 * netdev_boot_base     - get address from boot time settings
 617 * @prefix: prefix for network device
 618 * @unit: id for network device
 619 *
  620 * Check boot time settings for the base address of the device.
  621 * The found address is returned to be used
  622 * later in the device probing.
  623 * Returns 0 if no settings are found.
 624 */
 625unsigned long netdev_boot_base(const char *prefix, int unit)
 626{
 627        const struct netdev_boot_setup *s = dev_boot_setup;
 628        char name[IFNAMSIZ];
 629        int i;
 630
 631        sprintf(name, "%s%d", prefix, unit);
 632
  633        /*
  634         * If the device is already registered then return a base of 1
  635         * to indicate not to probe for this interface
  636         */
 637        if (__dev_get_by_name(&init_net, name))
 638                return 1;
 639
 640        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 641                if (!strcmp(name, s[i].name))
 642                        return s[i].map.base_addr;
 643        return 0;
 644}
 645
 646/*
 647 * Saves at boot time configured settings for any netdevice.
 648 */
 649int __init netdev_boot_setup(char *str)
 650{
 651        int ints[5];
 652        struct ifmap map;
 653
 654        str = get_options(str, ARRAY_SIZE(ints), ints);
 655        if (!str || !*str)
 656                return 0;
 657
 658        /* Save settings */
 659        memset(&map, 0, sizeof(map));
 660        if (ints[0] > 0)
 661                map.irq = ints[1];
 662        if (ints[0] > 1)
 663                map.base_addr = ints[2];
 664        if (ints[0] > 2)
 665                map.mem_start = ints[3];
 666        if (ints[0] > 3)
 667                map.mem_end = ints[4];
 668
 669        /* Add new entry to the list */
 670        return netdev_boot_setup_add(str, &map);
 671}
 672
 673__setup("netdev=", netdev_boot_setup);
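/*
 * Illustrative example (editorial addition): the "netdev=" boot parameter
 * parsed above takes up to four integers followed by a device name, in the
 * order irq, base_addr, mem_start, mem_end, e.g.:
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * Values not given are left at zero in the saved struct ifmap.
 */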
 674
 675/*******************************************************************************
 676 *
 677 *                          Device Interface Subroutines
 678 *
 679 *******************************************************************************/
 680
 681/**
 682 *      dev_get_iflink  - get 'iflink' value of a interface
 683 *      @dev: targeted interface
 684 *
 685 *      Indicates the ifindex the interface is linked to.
 686 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 687 */
 688
 689int dev_get_iflink(const struct net_device *dev)
 690{
 691        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 692                return dev->netdev_ops->ndo_get_iflink(dev);
 693
 694        return dev->ifindex;
 695}
 696EXPORT_SYMBOL(dev_get_iflink);
 697
 698/**
 699 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 700 *      @dev: targeted interface
 701 *      @skb: The packet.
 702 *
  703 *      For better visibility of tunnel traffic, OVS needs to retrieve
  704 *      egress tunnel information for a packet. The following API allows
  705 *      the caller to get this info.
 706 */
 707int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 708{
 709        struct ip_tunnel_info *info;
 710
 711        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 712                return -EINVAL;
 713
 714        info = skb_tunnel_info_unclone(skb);
 715        if (!info)
 716                return -ENOMEM;
 717        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 718                return -EINVAL;
 719
 720        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 721}
 722EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 723
 724/**
 725 *      __dev_get_by_name       - find a device by its name
 726 *      @net: the applicable net namespace
 727 *      @name: name to find
 728 *
 729 *      Find an interface by name. Must be called under RTNL semaphore
 730 *      or @dev_base_lock. If the name is found a pointer to the device
 731 *      is returned. If the name is not found then %NULL is returned. The
 732 *      reference counters are not incremented so the caller must be
 733 *      careful with locks.
 734 */
 735
 736struct net_device *__dev_get_by_name(struct net *net, const char *name)
 737{
 738        struct net_device *dev;
 739        struct hlist_head *head = dev_name_hash(net, name);
 740
 741        hlist_for_each_entry(dev, head, name_hlist)
 742                if (!strncmp(dev->name, name, IFNAMSIZ))
 743                        return dev;
 744
 745        return NULL;
 746}
 747EXPORT_SYMBOL(__dev_get_by_name);
 748
 749/**
 750 * dev_get_by_name_rcu  - find a device by its name
 751 * @net: the applicable net namespace
 752 * @name: name to find
 753 *
 754 * Find an interface by name.
 755 * If the name is found a pointer to the device is returned.
 756 * If the name is not found then %NULL is returned.
 757 * The reference counters are not incremented so the caller must be
 758 * careful with locks. The caller must hold RCU lock.
 759 */
 760
 761struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 762{
 763        struct net_device *dev;
 764        struct hlist_head *head = dev_name_hash(net, name);
 765
 766        hlist_for_each_entry_rcu(dev, head, name_hlist)
 767                if (!strncmp(dev->name, name, IFNAMSIZ))
 768                        return dev;
 769
 770        return NULL;
 771}
 772EXPORT_SYMBOL(dev_get_by_name_rcu);
 773
 774/**
 775 *      dev_get_by_name         - find a device by its name
 776 *      @net: the applicable net namespace
 777 *      @name: name to find
 778 *
 779 *      Find an interface by name. This can be called from any
 780 *      context and does its own locking. The returned handle has
 781 *      the usage count incremented and the caller must use dev_put() to
 782 *      release it when it is no longer needed. %NULL is returned if no
 783 *      matching device is found.
 784 */
 785
 786struct net_device *dev_get_by_name(struct net *net, const char *name)
 787{
 788        struct net_device *dev;
 789
 790        rcu_read_lock();
 791        dev = dev_get_by_name_rcu(net, name);
 792        if (dev)
 793                dev_hold(dev);
 794        rcu_read_unlock();
 795        return dev;
 796}
 797EXPORT_SYMBOL(dev_get_by_name);
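/*
 * Illustrative sketch (editorial addition): the two lookup-by-name styles
 * documented above.  dev_get_by_name() takes a reference that must be
 * released with dev_put(), while dev_get_by_name_rcu() is only valid inside
 * an RCU read-side section.  example_lookup_by_name() and "eth0" are
 * hypothetical.
 */
#if 0
static void example_lookup_by_name(struct net *net)
{
	struct net_device *dev;

	/* Refcounted lookup, usable from any context. */
	dev = dev_get_by_name(net, "eth0");
	if (dev) {
		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
		dev_put(dev);
	}

	/* RCU lookup: no reference taken, pointer valid only under RCU. */
	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");
	if (dev)
		pr_info("%s is %s\n", dev->name,
			netif_running(dev) ? "running" : "down");
	rcu_read_unlock();
}
#endif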
 798
 799/**
 800 *      __dev_get_by_index - find a device by its ifindex
 801 *      @net: the applicable net namespace
 802 *      @ifindex: index of device
 803 *
  804 *      Search for an interface by index. Returns a pointer to the device,
  805 *      or %NULL if it is not found. The device has not
 806 *      had its reference counter increased so the caller must be careful
 807 *      about locking. The caller must hold either the RTNL semaphore
 808 *      or @dev_base_lock.
 809 */
 810
 811struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 812{
 813        struct net_device *dev;
 814        struct hlist_head *head = dev_index_hash(net, ifindex);
 815
 816        hlist_for_each_entry(dev, head, index_hlist)
 817                if (dev->ifindex == ifindex)
 818                        return dev;
 819
 820        return NULL;
 821}
 822EXPORT_SYMBOL(__dev_get_by_index);
 823
 824/**
 825 *      dev_get_by_index_rcu - find a device by its ifindex
 826 *      @net: the applicable net namespace
 827 *      @ifindex: index of device
 828 *
  829 *      Search for an interface by index. Returns a pointer to the device,
  830 *      or %NULL if it is not found. The device has not
 831 *      had its reference counter increased so the caller must be careful
 832 *      about locking. The caller must hold RCU lock.
 833 */
 834
 835struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 836{
 837        struct net_device *dev;
 838        struct hlist_head *head = dev_index_hash(net, ifindex);
 839
 840        hlist_for_each_entry_rcu(dev, head, index_hlist)
 841                if (dev->ifindex == ifindex)
 842                        return dev;
 843
 844        return NULL;
 845}
 846EXPORT_SYMBOL(dev_get_by_index_rcu);
 847
 848
 849/**
 850 *      dev_get_by_index - find a device by its ifindex
 851 *      @net: the applicable net namespace
 852 *      @ifindex: index of device
 853 *
  854 *      Search for an interface by index. Returns a pointer to the device,
  855 *      or %NULL if it is not found. The device returned has
 856 *      had a reference added and the pointer is safe until the user calls
 857 *      dev_put to indicate they have finished with it.
 858 */
 859
 860struct net_device *dev_get_by_index(struct net *net, int ifindex)
 861{
 862        struct net_device *dev;
 863
 864        rcu_read_lock();
 865        dev = dev_get_by_index_rcu(net, ifindex);
 866        if (dev)
 867                dev_hold(dev);
 868        rcu_read_unlock();
 869        return dev;
 870}
 871EXPORT_SYMBOL(dev_get_by_index);
 872
 873/**
 874 *      dev_get_by_napi_id - find a device by napi_id
 875 *      @napi_id: ID of the NAPI struct
 876 *
  877 *      Search for an interface by NAPI ID. Returns a pointer to the device,
  878 *      or %NULL if it is not found. The device has not had
 879 *      its reference counter increased so the caller must be careful
 880 *      about locking. The caller must hold RCU lock.
 881 */
 882
 883struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 884{
 885        struct napi_struct *napi;
 886
 887        WARN_ON_ONCE(!rcu_read_lock_held());
 888
 889        if (napi_id < MIN_NAPI_ID)
 890                return NULL;
 891
 892        napi = napi_by_id(napi_id);
 893
 894        return napi ? napi->dev : NULL;
 895}
 896EXPORT_SYMBOL(dev_get_by_napi_id);
 897
 898/**
 899 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 900 *      @net: network namespace
 901 *      @name: a pointer to the buffer where the name will be stored.
 902 *      @ifindex: the ifindex of the interface to get the name from.
 903 *
 904 *      The use of raw_seqcount_begin() and cond_resched() before
 905 *      retrying is required as we want to give the writers a chance
 906 *      to complete when CONFIG_PREEMPT is not set.
 907 */
 908int netdev_get_name(struct net *net, char *name, int ifindex)
 909{
 910        struct net_device *dev;
 911        unsigned int seq;
 912
 913retry:
 914        seq = raw_seqcount_begin(&devnet_rename_seq);
 915        rcu_read_lock();
 916        dev = dev_get_by_index_rcu(net, ifindex);
 917        if (!dev) {
 918                rcu_read_unlock();
 919                return -ENODEV;
 920        }
 921
 922        strcpy(name, dev->name);
 923        rcu_read_unlock();
 924        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 925                cond_resched();
 926                goto retry;
 927        }
 928
 929        return 0;
 930}
 931
 932/**
 933 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 934 *      @net: the applicable net namespace
 935 *      @type: media type of device
 936 *      @ha: hardware address
 937 *
  938 *      Search for an interface by MAC address. Returns a pointer to the
  939 *      device, or %NULL if it is not found.
 940 *      The caller must hold RCU or RTNL.
 941 *      The returned device has not had its ref count increased
 942 *      and the caller must therefore be careful about locking
 943 *
 944 */
 945
 946struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 947                                       const char *ha)
 948{
 949        struct net_device *dev;
 950
 951        for_each_netdev_rcu(net, dev)
 952                if (dev->type == type &&
 953                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 954                        return dev;
 955
 956        return NULL;
 957}
 958EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
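/*
 * Illustrative sketch (editorial addition): looking up an Ethernet device by
 * its hardware address with dev_getbyhwaddr_rcu().  The lookup runs under
 * RCU and the result is not reference counted, so we take our own reference
 * before leaving the read-side section.  example_find_by_mac() is a
 * hypothetical name.
 */
#if 0
static struct net_device *example_find_by_mac(struct net *net, const char *mac)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	if (dev)
		dev_hold(dev);	/* take a reference before leaving RCU */
	rcu_read_unlock();

	return dev;	/* caller must dev_put() a non-NULL result */
}
#endif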
 959
 960struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 961{
 962        struct net_device *dev;
 963
 964        ASSERT_RTNL();
 965        for_each_netdev(net, dev)
 966                if (dev->type == type)
 967                        return dev;
 968
 969        return NULL;
 970}
 971EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 972
 973struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 974{
 975        struct net_device *dev, *ret = NULL;
 976
 977        rcu_read_lock();
 978        for_each_netdev_rcu(net, dev)
 979                if (dev->type == type) {
 980                        dev_hold(dev);
 981                        ret = dev;
 982                        break;
 983                }
 984        rcu_read_unlock();
 985        return ret;
 986}
 987EXPORT_SYMBOL(dev_getfirstbyhwtype);
 988
 989/**
 990 *      __dev_get_by_flags - find any device with given flags
 991 *      @net: the applicable net namespace
 992 *      @if_flags: IFF_* values
 993 *      @mask: bitmask of bits in if_flags to check
 994 *
  995 *      Search for any interface with the given flags. Returns a pointer to
  996 *      the first matching device, or %NULL if none is found. Must be called inside
 997 *      rtnl_lock(), and result refcount is unchanged.
 998 */
 999
1000struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1001                                      unsigned short mask)
1002{
1003        struct net_device *dev, *ret;
1004
1005        ASSERT_RTNL();
1006
1007        ret = NULL;
1008        for_each_netdev(net, dev) {
1009                if (((dev->flags ^ if_flags) & mask) == 0) {
1010                        ret = dev;
1011                        break;
1012                }
1013        }
1014        return ret;
1015}
1016EXPORT_SYMBOL(__dev_get_by_flags);
1017
1018/**
1019 *      dev_valid_name - check if name is okay for network device
1020 *      @name: name string
1021 *
 1022 *      Network device names need to be valid file names
 1023 *      to allow sysfs to work.  We also disallow any kind of
 1024 *      whitespace.
1025 */
1026bool dev_valid_name(const char *name)
1027{
1028        if (*name == '\0')
1029                return false;
1030        if (strlen(name) >= IFNAMSIZ)
1031                return false;
1032        if (!strcmp(name, ".") || !strcmp(name, ".."))
1033                return false;
1034
1035        while (*name) {
1036                if (*name == '/' || *name == ':' || isspace(*name))
1037                        return false;
1038                name++;
1039        }
1040        return true;
1041}
1042EXPORT_SYMBOL(dev_valid_name);
1043
1044/**
1045 *      __dev_alloc_name - allocate a name for a device
1046 *      @net: network namespace to allocate the device name in
1047 *      @name: name format string
1048 *      @buf:  scratch buffer and result name string
1049 *
 1050 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 1051 *      id. It scans the list of devices to build up a free map, then chooses
1052 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1053 *      while allocating the name and adding the device in order to avoid
1054 *      duplicates.
1055 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1056 *      Returns the number of the unit assigned or a negative errno code.
1057 */
1058
1059static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1060{
1061        int i = 0;
1062        const char *p;
1063        const int max_netdevices = 8*PAGE_SIZE;
1064        unsigned long *inuse;
1065        struct net_device *d;
1066
1067        if (!dev_valid_name(name))
1068                return -EINVAL;
1069
1070        p = strchr(name, '%');
1071        if (p) {
1072                /*
1073                 * Verify the string as this thing may have come from
1074                 * the user.  There must be either one "%d" and no other "%"
1075                 * characters.
1076                 */
1077                if (p[1] != 'd' || strchr(p + 2, '%'))
1078                        return -EINVAL;
1079
1080                /* Use one page as a bit array of possible slots */
1081                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1082                if (!inuse)
1083                        return -ENOMEM;
1084
1085                for_each_netdev(net, d) {
1086                        if (!sscanf(d->name, name, &i))
1087                                continue;
1088                        if (i < 0 || i >= max_netdevices)
1089                                continue;
1090
1091                        /*  avoid cases where sscanf is not exact inverse of printf */
1092                        snprintf(buf, IFNAMSIZ, name, i);
1093                        if (!strncmp(buf, d->name, IFNAMSIZ))
1094                                set_bit(i, inuse);
1095                }
1096
1097                i = find_first_zero_bit(inuse, max_netdevices);
1098                free_page((unsigned long) inuse);
1099        }
1100
1101        snprintf(buf, IFNAMSIZ, name, i);
1102        if (!__dev_get_by_name(net, buf))
1103                return i;
1104
1105        /* It is possible to run out of possible slots
1106         * when the name is long and there isn't enough space left
1107         * for the digits, or if all bits are used.
1108         */
1109        return -ENFILE;
1110}
1111
1112static int dev_alloc_name_ns(struct net *net,
1113                             struct net_device *dev,
1114                             const char *name)
1115{
1116        char buf[IFNAMSIZ];
1117        int ret;
1118
1119        BUG_ON(!net);
1120        ret = __dev_alloc_name(net, name, buf);
1121        if (ret >= 0)
1122                strlcpy(dev->name, buf, IFNAMSIZ);
1123        return ret;
1124}
1125
1126/**
1127 *      dev_alloc_name - allocate a name for a device
1128 *      @dev: device
1129 *      @name: name format string
1130 *
 1131 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 1132 *      id. It scans the list of devices to build up a free map, then chooses
1133 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1134 *      while allocating the name and adding the device in order to avoid
1135 *      duplicates.
1136 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1137 *      Returns the number of the unit assigned or a negative errno code.
1138 */
1139
1140int dev_alloc_name(struct net_device *dev, const char *name)
1141{
1142        return dev_alloc_name_ns(dev_net(dev), dev, name);
1143}
1144EXPORT_SYMBOL(dev_alloc_name);
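/*
 * Illustrative sketch (editorial addition): a driver requesting a wildcarded
 * name via dev_alloc_name() before registration, with the dev_base or rtnl
 * lock held as noted above.  example_pick_name() and "veth%d" are
 * hypothetical.
 */
#if 0
static int example_pick_name(struct net_device *dev)
{
	int unit;

	unit = dev_alloc_name(dev, "veth%d");
	if (unit < 0)
		return unit;	/* e.g. -EINVAL or -ENFILE */

	pr_info("assigned unit %d, name %s\n", unit, dev->name);
	return 0;
}
#endif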
1145
1146int dev_get_valid_name(struct net *net, struct net_device *dev,
1147                       const char *name)
1148{
1149        BUG_ON(!net);
1150
1151        if (!dev_valid_name(name))
1152                return -EINVAL;
1153
1154        if (strchr(name, '%'))
1155                return dev_alloc_name_ns(net, dev, name);
1156        else if (__dev_get_by_name(net, name))
1157                return -EEXIST;
1158        else if (dev->name != name)
1159                strlcpy(dev->name, name, IFNAMSIZ);
1160
1161        return 0;
1162}
1163EXPORT_SYMBOL(dev_get_valid_name);
1164
1165/**
1166 *      dev_change_name - change name of a device
1167 *      @dev: device
1168 *      @newname: name (or format string) must be at least IFNAMSIZ
1169 *
 1170 *      Change the name of a device. A format string such as "eth%d"
 1171 *      can be passed for wildcarding.
1172 */
1173int dev_change_name(struct net_device *dev, const char *newname)
1174{
1175        unsigned char old_assign_type;
1176        char oldname[IFNAMSIZ];
1177        int err = 0;
1178        int ret;
1179        struct net *net;
1180
1181        ASSERT_RTNL();
1182        BUG_ON(!dev_net(dev));
1183
1184        net = dev_net(dev);
1185        if (dev->flags & IFF_UP)
1186                return -EBUSY;
1187
1188        write_seqcount_begin(&devnet_rename_seq);
1189
1190        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1191                write_seqcount_end(&devnet_rename_seq);
1192                return 0;
1193        }
1194
1195        memcpy(oldname, dev->name, IFNAMSIZ);
1196
1197        err = dev_get_valid_name(net, dev, newname);
1198        if (err < 0) {
1199                write_seqcount_end(&devnet_rename_seq);
1200                return err;
1201        }
1202
1203        if (oldname[0] && !strchr(oldname, '%'))
1204                netdev_info(dev, "renamed from %s\n", oldname);
1205
1206        old_assign_type = dev->name_assign_type;
1207        dev->name_assign_type = NET_NAME_RENAMED;
1208
1209rollback:
1210        ret = device_rename(&dev->dev, dev->name);
1211        if (ret) {
1212                memcpy(dev->name, oldname, IFNAMSIZ);
1213                dev->name_assign_type = old_assign_type;
1214                write_seqcount_end(&devnet_rename_seq);
1215                return ret;
1216        }
1217
1218        write_seqcount_end(&devnet_rename_seq);
1219
1220        netdev_adjacent_rename_links(dev, oldname);
1221
1222        write_lock_bh(&dev_base_lock);
1223        hlist_del_rcu(&dev->name_hlist);
1224        write_unlock_bh(&dev_base_lock);
1225
1226        synchronize_rcu();
1227
1228        write_lock_bh(&dev_base_lock);
1229        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1230        write_unlock_bh(&dev_base_lock);
1231
1232        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1233        ret = notifier_to_errno(ret);
1234
1235        if (ret) {
1236                /* err >= 0 after dev_alloc_name() or stores the first errno */
1237                if (err >= 0) {
1238                        err = ret;
1239                        write_seqcount_begin(&devnet_rename_seq);
1240                        memcpy(dev->name, oldname, IFNAMSIZ);
1241                        memcpy(oldname, newname, IFNAMSIZ);
1242                        dev->name_assign_type = old_assign_type;
1243                        old_assign_type = NET_NAME_RENAMED;
1244                        goto rollback;
1245                } else {
1246                        pr_err("%s: name change rollback failed: %d\n",
1247                               dev->name, ret);
1248                }
1249        }
1250
1251        return err;
1252}
1253
1254/**
1255 *      dev_set_alias - change ifalias of a device
1256 *      @dev: device
1257 *      @alias: name up to IFALIASZ
 1258 *      @len: limit of bytes to copy from @alias
 1259 *
 1260 *      Set the ifalias for a device.
1261 */
1262int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1263{
1264        struct dev_ifalias *new_alias = NULL;
1265
1266        if (len >= IFALIASZ)
1267                return -EINVAL;
1268
1269        if (len) {
1270                new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1271                if (!new_alias)
1272                        return -ENOMEM;
1273
1274                memcpy(new_alias->ifalias, alias, len);
1275                new_alias->ifalias[len] = 0;
1276        }
1277
1278        mutex_lock(&ifalias_mutex);
1279        rcu_swap_protected(dev->ifalias, new_alias,
1280                           mutex_is_locked(&ifalias_mutex));
1281        mutex_unlock(&ifalias_mutex);
1282
1283        if (new_alias)
1284                kfree_rcu(new_alias, rcuhead);
1285
1286        return len;
1287}
1288
1289/**
1290 *      dev_get_alias - get ifalias of a device
1291 *      @dev: device
1292 *      @name: buffer to store name of ifalias
1293 *      @len: size of buffer
1294 *
 1295 *      Get the ifalias for a device.  The caller must make sure dev cannot go
 1296 *      away, e.g. by holding the RCU read lock or a reference count on the device.
1297 */
1298int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1299{
1300        const struct dev_ifalias *alias;
1301        int ret = 0;
1302
1303        rcu_read_lock();
1304        alias = rcu_dereference(dev->ifalias);
1305        if (alias)
1306                ret = snprintf(name, len, "%s", alias->ifalias);
1307        rcu_read_unlock();
1308
1309        return ret;
1310}
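/*
 * Illustrative sketch (editorial addition): setting an interface alias with
 * dev_set_alias() and reading it back with dev_get_alias().  The
 * example_alias() name and the alias text are hypothetical.
 */
#if 0
static void example_alias(struct net_device *dev)
{
	char buf[IFALIASZ];
	const char *alias = "uplink to core switch";

	if (dev_set_alias(dev, alias, strlen(alias)) < 0)
		return;

	if (dev_get_alias(dev, buf, sizeof(buf)) > 0)
		pr_info("%s alias: %s\n", dev->name, buf);
}
#endif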
1311
1312/**
1313 *      netdev_features_change - device changes features
1314 *      @dev: device to cause notification
1315 *
1316 *      Called to indicate a device has changed features.
1317 */
1318void netdev_features_change(struct net_device *dev)
1319{
1320        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1321}
1322EXPORT_SYMBOL(netdev_features_change);
1323
1324/**
1325 *      netdev_state_change - device changes state
1326 *      @dev: device to cause notification
1327 *
1328 *      Called to indicate a device has changed state. This function calls
1329 *      the notifier chains for netdev_chain and sends a NEWLINK message
1330 *      to the routing socket.
1331 */
1332void netdev_state_change(struct net_device *dev)
1333{
1334        if (dev->flags & IFF_UP) {
1335                struct netdev_notifier_change_info change_info = {
1336                        .info.dev = dev,
1337                };
1338
1339                call_netdevice_notifiers_info(NETDEV_CHANGE,
1340                                              &change_info.info);
1341                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1342        }
1343}
1344EXPORT_SYMBOL(netdev_state_change);
1345
1346/**
1347 * netdev_notify_peers - notify network peers about existence of @dev
1348 * @dev: network device
1349 *
1350 * Generate traffic such that interested network peers are aware of
1351 * @dev, such as by generating a gratuitous ARP. This may be used when
1352 * a device wants to inform the rest of the network about some sort of
1353 * reconfiguration such as a failover event or virtual machine
1354 * migration.
1355 */
1356void netdev_notify_peers(struct net_device *dev)
1357{
1358        rtnl_lock();
1359        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1360        call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1361        rtnl_unlock();
1362}
1363EXPORT_SYMBOL(netdev_notify_peers);
1364
1365static int __dev_open(struct net_device *dev)
1366{
1367        const struct net_device_ops *ops = dev->netdev_ops;
1368        int ret;
1369
1370        ASSERT_RTNL();
1371
1372        if (!netif_device_present(dev))
1373                return -ENODEV;
1374
1375        /* Block netpoll from trying to do any rx path servicing.
1376         * If we don't do this there is a chance ndo_poll_controller
1377         * or ndo_poll may be running while we open the device
1378         */
1379        netpoll_poll_disable(dev);
1380
1381        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1382        ret = notifier_to_errno(ret);
1383        if (ret)
1384                return ret;
1385
1386        set_bit(__LINK_STATE_START, &dev->state);
1387
1388        if (ops->ndo_validate_addr)
1389                ret = ops->ndo_validate_addr(dev);
1390
1391        if (!ret && ops->ndo_open)
1392                ret = ops->ndo_open(dev);
1393
1394        netpoll_poll_enable(dev);
1395
1396        if (ret)
1397                clear_bit(__LINK_STATE_START, &dev->state);
1398        else {
1399                dev->flags |= IFF_UP;
1400                dev_set_rx_mode(dev);
1401                dev_activate(dev);
1402                add_device_randomness(dev->dev_addr, dev->addr_len);
1403        }
1404
1405        return ret;
1406}
1407
1408/**
1409 *      dev_open        - prepare an interface for use.
1410 *      @dev:   device to open
1411 *
1412 *      Takes a device from down to up state. The device's private open
1413 *      function is invoked and then the multicast lists are loaded. Finally
1414 *      the device is moved into the up state and a %NETDEV_UP message is
1415 *      sent to the netdev notifier chain.
1416 *
1417 *      Calling this function on an active interface is a nop. On a failure
1418 *      a negative errno code is returned.
1419 */
1420int dev_open(struct net_device *dev)
1421{
1422        int ret;
1423
1424        if (dev->flags & IFF_UP)
1425                return 0;
1426
1427        ret = __dev_open(dev);
1428        if (ret < 0)
1429                return ret;
1430
1431        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1432        call_netdevice_notifiers(NETDEV_UP, dev);
1433
1434        return ret;
1435}
1436EXPORT_SYMBOL(dev_open);
1437
1438static void __dev_close_many(struct list_head *head)
1439{
1440        struct net_device *dev;
1441
1442        ASSERT_RTNL();
1443        might_sleep();
1444
1445        list_for_each_entry(dev, head, close_list) {
1446                /* Temporarily disable netpoll until the interface is down */
1447                netpoll_poll_disable(dev);
1448
1449                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1450
1451                clear_bit(__LINK_STATE_START, &dev->state);
1452
1453                /* Synchronize to scheduled poll. We cannot touch poll list, it
1454                 * can be even on different cpu. So just clear netif_running().
1455                 *
 1456                 * dev->stop() will invoke napi_disable() on all of its
1457                 * napi_struct instances on this device.
1458                 */
1459                smp_mb__after_atomic(); /* Commit netif_running(). */
1460        }
1461
1462        dev_deactivate_many(head);
1463
1464        list_for_each_entry(dev, head, close_list) {
1465                const struct net_device_ops *ops = dev->netdev_ops;
1466
1467                /*
1468                 *      Call the device specific close. This cannot fail.
1469                 *      Only if device is UP
1470                 *
1471                 *      We allow it to be called even after a DETACH hot-plug
1472                 *      event.
1473                 */
1474                if (ops->ndo_stop)
1475                        ops->ndo_stop(dev);
1476
1477                dev->flags &= ~IFF_UP;
1478                netpoll_poll_enable(dev);
1479        }
1480}
1481
1482static void __dev_close(struct net_device *dev)
1483{
1484        LIST_HEAD(single);
1485
1486        list_add(&dev->close_list, &single);
1487        __dev_close_many(&single);
1488        list_del(&single);
1489}
1490
1491void dev_close_many(struct list_head *head, bool unlink)
1492{
1493        struct net_device *dev, *tmp;
1494
1495        /* Remove the devices that don't need to be closed */
1496        list_for_each_entry_safe(dev, tmp, head, close_list)
1497                if (!(dev->flags & IFF_UP))
1498                        list_del_init(&dev->close_list);
1499
1500        __dev_close_many(head);
1501
1502        list_for_each_entry_safe(dev, tmp, head, close_list) {
1503                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1504                call_netdevice_notifiers(NETDEV_DOWN, dev);
1505                if (unlink)
1506                        list_del_init(&dev->close_list);
1507        }
1508}
1509EXPORT_SYMBOL(dev_close_many);
1510
1511/**
1512 *      dev_close - shutdown an interface.
1513 *      @dev: device to shutdown
1514 *
1515 *      This function moves an active device into down state. A
1516 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1517 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1518 *      chain.
1519 */
1520void dev_close(struct net_device *dev)
1521{
1522        if (dev->flags & IFF_UP) {
1523                LIST_HEAD(single);
1524
1525                list_add(&dev->close_list, &single);
1526                dev_close_many(&single, true);
1527                list_del(&single);
1528        }
1529}
1530EXPORT_SYMBOL(dev_close);
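/*
 * Illustrative sketch (editorial addition): bringing an interface up and
 * down from inside the kernel.  Both dev_open() and dev_close() expect the
 * RTNL semaphore to be held by the caller.  example_bounce_interface() is a
 * hypothetical name.
 */
#if 0
static int example_bounce_interface(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);		/* nop if already IFF_UP */
	if (!err)
		dev_close(dev);		/* nop if not IFF_UP */
	rtnl_unlock();

	return err;
}
#endif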
1531
1532
1533/**
1534 *      dev_disable_lro - disable Large Receive Offload on a device
1535 *      @dev: device
1536 *
1537 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1538 *      called under RTNL.  This is needed if received packets may be
1539 *      forwarded to another interface.
1540 */
1541void dev_disable_lro(struct net_device *dev)
1542{
1543        struct net_device *lower_dev;
1544        struct list_head *iter;
1545
1546        dev->wanted_features &= ~NETIF_F_LRO;
1547        netdev_update_features(dev);
1548
1549        if (unlikely(dev->features & NETIF_F_LRO))
1550                netdev_WARN(dev, "failed to disable LRO!\n");
1551
1552        netdev_for_each_lower_dev(dev, lower_dev, iter)
1553                dev_disable_lro(lower_dev);
1554}
1555EXPORT_SYMBOL(dev_disable_lro);
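
/*
 * Example (illustrative sketch, not from the original file): a stacked
 * device about to start forwarding disables LRO on its lower device.
 * example_prepare_forwarding() is hypothetical.
 */
static void example_prepare_forwarding(struct net_device *lower)
{
        ASSERT_RTNL();          /* dev_disable_lro() must run under RTNL */
        dev_disable_lro(lower); /* forwarded traffic must not be LRO-merged */
}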
1556
1557static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1558                                   struct net_device *dev)
1559{
1560        struct netdev_notifier_info info = {
1561                .dev = dev,
1562        };
1563
1564        return nb->notifier_call(nb, val, &info);
1565}
1566
1567static int dev_boot_phase = 1;
1568
1569/**
1570 * register_netdevice_notifier - register a network notifier block
1571 * @nb: notifier
1572 *
1573 * Register a notifier to be called when network device events occur.
1574 * The notifier passed is linked into the kernel structures and must
1575 * not be reused until it has been unregistered. A negative errno code
1576 * is returned on a failure.
1577 *
1578 * When registered, all registration and up events are replayed
1579 * to the new notifier so that it has a race-free view of the
1580 * network device list.
1581 */
1582
1583int register_netdevice_notifier(struct notifier_block *nb)
1584{
1585        struct net_device *dev;
1586        struct net_device *last;
1587        struct net *net;
1588        int err;
1589
1590        rtnl_lock();
1591        err = raw_notifier_chain_register(&netdev_chain, nb);
1592        if (err)
1593                goto unlock;
1594        if (dev_boot_phase)
1595                goto unlock;
1596        for_each_net(net) {
1597                for_each_netdev(net, dev) {
1598                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1599                        err = notifier_to_errno(err);
1600                        if (err)
1601                                goto rollback;
1602
1603                        if (!(dev->flags & IFF_UP))
1604                                continue;
1605
1606                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1607                }
1608        }
1609
1610unlock:
1611        rtnl_unlock();
1612        return err;
1613
1614rollback:
1615        last = dev;
1616        for_each_net(net) {
1617                for_each_netdev(net, dev) {
1618                        if (dev == last)
1619                                goto outroll;
1620
1621                        if (dev->flags & IFF_UP) {
1622                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1623                                                        dev);
1624                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1625                        }
1626                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1627                }
1628        }
1629
1630outroll:
1631        raw_notifier_chain_unregister(&netdev_chain, nb);
1632        goto unlock;
1633}
1634EXPORT_SYMBOL(register_netdevice_notifier);
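
/*
 * Example (illustrative sketch, not from the original file): a minimal
 * consumer of the netdev notifier chain.  The example_* names are
 * hypothetical; the netdev_notifier_info_to_dev()/NOTIFY_DONE pattern is
 * the usual one for such handlers.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_REGISTER:
        case NETDEV_UP:
                pr_info("%s: %s saw event %lu\n", __func__, dev->name, event);
                break;
        default:
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
        .notifier_call = example_netdev_event,
};

static int __init example_notifier_init(void)
{
        /* existing devices are replayed as REGISTER/UP events on registration */
        return register_netdevice_notifier(&example_netdev_nb);
}

static void __exit example_notifier_exit(void)
{
        unregister_netdevice_notifier(&example_netdev_nb);
}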
1635
1636/**
1637 * unregister_netdevice_notifier - unregister a network notifier block
1638 * @nb: notifier
1639 *
1640 * Unregister a notifier previously registered by
1641 * register_netdevice_notifier(). The notifier is unlinked from the
1642 * kernel structures and may then be reused. A negative errno code
1643 * is returned on a failure.
1644 *
1645 * After unregistering, unregister and down device events are synthesized
1646 * for all devices on the device list and sent to the removed notifier,
1647 * removing the need for special-case cleanup code.
1648 */
1649
1650int unregister_netdevice_notifier(struct notifier_block *nb)
1651{
1652        struct net_device *dev;
1653        struct net *net;
1654        int err;
1655
1656        rtnl_lock();
1657        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1658        if (err)
1659                goto unlock;
1660
1661        for_each_net(net) {
1662                for_each_netdev(net, dev) {
1663                        if (dev->flags & IFF_UP) {
1664                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1665                                                        dev);
1666                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1667                        }
1668                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1669                }
1670        }
1671unlock:
1672        rtnl_unlock();
1673        return err;
1674}
1675EXPORT_SYMBOL(unregister_netdevice_notifier);
1676
1677/**
1678 *      call_netdevice_notifiers_info - call all network notifier blocks
1679 *      @val: value passed unmodified to notifier function
1680 *      @dev: net_device pointer passed unmodified to notifier function
1681 *      @info: notifier information data
1682 *
1683 *      Call all network notifier blocks.  Parameters and return value
1684 *      are as for raw_notifier_call_chain().
1685 */
1686
1687static int call_netdevice_notifiers_info(unsigned long val,
1688                                         struct netdev_notifier_info *info)
1689{
1690        ASSERT_RTNL();
1691        return raw_notifier_call_chain(&netdev_chain, val, info);
1692}
1693
1694/**
1695 *      call_netdevice_notifiers - call all network notifier blocks
1696 *      @val: value passed unmodified to notifier function
1697 *      @dev: net_device pointer passed unmodified to notifier function
1698 *
1699 *      Call all network notifier blocks.  Parameters and return value
1700 *      are as for raw_notifier_call_chain().
1701 */
1702
1703int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1704{
1705        struct netdev_notifier_info info = {
1706                .dev = dev,
1707        };
1708
1709        return call_netdevice_notifiers_info(val, &info);
1710}
1711EXPORT_SYMBOL(call_netdevice_notifiers);
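
/*
 * Example (illustrative sketch, not from the original file): how core code
 * typically raises an event on the chain and converts a notifier veto into
 * an errno.  The event choice is only illustrative and example_notify_mtu()
 * is hypothetical.
 */
static int example_notify_mtu(struct net_device *dev)
{
        int err;

        ASSERT_RTNL();
        err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
        return notifier_to_errno(err);
}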
1712
1713#ifdef CONFIG_NET_INGRESS
1714static struct static_key ingress_needed __read_mostly;
1715
1716void net_inc_ingress_queue(void)
1717{
1718        static_key_slow_inc(&ingress_needed);
1719}
1720EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1721
1722void net_dec_ingress_queue(void)
1723{
1724        static_key_slow_dec(&ingress_needed);
1725}
1726EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1727#endif
1728
1729#ifdef CONFIG_NET_EGRESS
1730static struct static_key egress_needed __read_mostly;
1731
1732void net_inc_egress_queue(void)
1733{
1734        static_key_slow_inc(&egress_needed);
1735}
1736EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1737
1738void net_dec_egress_queue(void)
1739{
1740        static_key_slow_dec(&egress_needed);
1741}
1742EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1743#endif
1744
1745static struct static_key netstamp_needed __read_mostly;
1746#ifdef HAVE_JUMP_LABEL
1747static atomic_t netstamp_needed_deferred;
1748static atomic_t netstamp_wanted;
1749static void netstamp_clear(struct work_struct *work)
1750{
1751        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1752        int wanted;
1753
1754        wanted = atomic_add_return(deferred, &netstamp_wanted);
1755        if (wanted > 0)
1756                static_key_enable(&netstamp_needed);
1757        else
1758                static_key_disable(&netstamp_needed);
1759}
1760static DECLARE_WORK(netstamp_work, netstamp_clear);
1761#endif
1762
1763void net_enable_timestamp(void)
1764{
1765#ifdef HAVE_JUMP_LABEL
1766        int wanted;
1767
1768        while (1) {
1769                wanted = atomic_read(&netstamp_wanted);
1770                if (wanted <= 0)
1771                        break;
1772                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1773                        return;
1774        }
1775        atomic_inc(&netstamp_needed_deferred);
1776        schedule_work(&netstamp_work);
1777#else
1778        static_key_slow_inc(&netstamp_needed);
1779#endif
1780}
1781EXPORT_SYMBOL(net_enable_timestamp);
1782
1783void net_disable_timestamp(void)
1784{
1785#ifdef HAVE_JUMP_LABEL
1786        int wanted;
1787
1788        while (1) {
1789                wanted = atomic_read(&netstamp_wanted);
1790                if (wanted <= 1)
1791                        break;
1792                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1793                        return;
1794        }
1795        atomic_dec(&netstamp_needed_deferred);
1796        schedule_work(&netstamp_work);
1797#else
1798        static_key_slow_dec(&netstamp_needed);
1799#endif
1800}
1801EXPORT_SYMBOL(net_disable_timestamp);
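
/*
 * Example (illustrative sketch, not from the original file): the enable and
 * disable calls are reference counted, so a consumer that needs packet
 * timestamps simply brackets its lifetime with them.  The example_tap_*
 * helpers are hypothetical.
 */
static void example_tap_open(void)
{
        net_enable_timestamp();         /* timestamps now taken in net_timestamp_set() */
}

static void example_tap_close(void)
{
        net_disable_timestamp();
}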
1802
1803static inline void net_timestamp_set(struct sk_buff *skb)
1804{
1805        skb->tstamp = 0;
1806        if (static_key_false(&netstamp_needed))
1807                __net_timestamp(skb);
1808}
1809
1810#define net_timestamp_check(COND, SKB)                  \
1811        if (static_key_false(&netstamp_needed)) {               \
1812                if ((COND) && !(SKB)->tstamp)   \
1813                        __net_timestamp(SKB);           \
1814        }                                               \
1815
1816bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1817{
1818        unsigned int len;
1819
1820        if (!(dev->flags & IFF_UP))
1821                return false;
1822
1823        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1824        if (skb->len <= len)
1825                return true;
1826
1827        /* if TSO is enabled, we don't care about the length as the packet
1828         * could be forwarded without being segmented first.
1829         */
1830        if (skb_is_gso(skb))
1831                return true;
1832
1833        return false;
1834}
1835EXPORT_SYMBOL_GPL(is_skb_forwardable);
1836
1837int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1838{
1839        int ret = ____dev_forward_skb(dev, skb);
1840
1841        if (likely(!ret)) {
1842                skb->protocol = eth_type_trans(skb, dev);
1843                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1844        }
1845
1846        return ret;
1847}
1848EXPORT_SYMBOL_GPL(__dev_forward_skb);
1849
1850/**
1851 * dev_forward_skb - loopback an skb to another netif
1852 *
1853 * @dev: destination network device
1854 * @skb: buffer to forward
1855 *
1856 * return values:
1857 *      NET_RX_SUCCESS  (no congestion)
1858 *      NET_RX_DROP     (packet was dropped, but freed)
1859 *
1860 * dev_forward_skb can be used for injecting an skb from the
1861 * start_xmit function of one device into the receive queue
1862 * of another device.
1863 *
1864 * The receiving device may be in another namespace, so
1865 * we have to clear all information in the skb that could
1866 * impact namespace isolation.
1867 */
1868int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1869{
1870        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1871}
1872EXPORT_SYMBOL_GPL(dev_forward_skb);
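
/*
 * Example (illustrative sketch, not from the original file): a veth-like
 * transmit routine handing each frame to its peer with dev_forward_skb().
 * The skb is consumed on both success and drop, so the caller never frees
 * it; 'peer' and example_peer_xmit() are hypothetical.
 */
static netdev_tx_t example_peer_xmit(struct sk_buff *skb,
                                     struct net_device *dev,
                                     struct net_device *peer)
{
        if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
                atomic_long_inc(&dev->tx_dropped);      /* skb already freed */

        return NETDEV_TX_OK;
}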
1873
1874static inline int deliver_skb(struct sk_buff *skb,
1875                              struct packet_type *pt_prev,
1876                              struct net_device *orig_dev)
1877{
1878        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1879                return -ENOMEM;
1880        refcount_inc(&skb->users);
1881        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1882}
1883
1884static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1885                                          struct packet_type **pt,
1886                                          struct net_device *orig_dev,
1887                                          __be16 type,
1888                                          struct list_head *ptype_list)
1889{
1890        struct packet_type *ptype, *pt_prev = *pt;
1891
1892        list_for_each_entry_rcu(ptype, ptype_list, list) {
1893                if (ptype->type != type)
1894                        continue;
1895                if (pt_prev)
1896                        deliver_skb(skb, pt_prev, orig_dev);
1897                pt_prev = ptype;
1898        }
1899        *pt = pt_prev;
1900}
1901
1902static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1903{
1904        if (!ptype->af_packet_priv || !skb->sk)
1905                return false;
1906
1907        if (ptype->id_match)
1908                return ptype->id_match(ptype, skb->sk);
1909        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1910                return true;
1911
1912        return false;
1913}
1914
1915/*
1916 *      Support routine. Sends outgoing frames to any network
1917 *      taps currently in use.
1918 */
1919
1920void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1921{
1922        struct packet_type *ptype;
1923        struct sk_buff *skb2 = NULL;
1924        struct packet_type *pt_prev = NULL;
1925        struct list_head *ptype_list = &ptype_all;
1926
1927        rcu_read_lock();
1928again:
1929        list_for_each_entry_rcu(ptype, ptype_list, list) {
1930                /* Never send packets back to the socket
1931                 * they originated from - MvS (miquels@drinkel.ow.org)
1932                 */
1933                if (skb_loop_sk(ptype, skb))
1934                        continue;
1935
1936                if (pt_prev) {
1937                        deliver_skb(skb2, pt_prev, skb->dev);
1938                        pt_prev = ptype;
1939                        continue;
1940                }
1941
1942                /* need to clone skb, done only once */
1943                skb2 = skb_clone(skb, GFP_ATOMIC);
1944                if (!skb2)
1945                        goto out_unlock;
1946
1947                net_timestamp_set(skb2);
1948
1949                /* The network header should already be set
1950                 * correctly by the sender, so the check below is
1951                 * just protection against buggy protocols.
1952                 */
1953                skb_reset_mac_header(skb2);
1954
1955                if (skb_network_header(skb2) < skb2->data ||
1956                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1957                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1958                                             ntohs(skb2->protocol),
1959                                             dev->name);
1960                        skb_reset_network_header(skb2);
1961                }
1962
1963                skb2->transport_header = skb2->network_header;
1964                skb2->pkt_type = PACKET_OUTGOING;
1965                pt_prev = ptype;
1966        }
1967
1968        if (ptype_list == &ptype_all) {
1969                ptype_list = &dev->ptype_all;
1970                goto again;
1971        }
1972out_unlock:
1973        if (pt_prev) {
1974                if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
1975                        pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1976                else
1977                        kfree_skb(skb2);
1978        }
1979        rcu_read_unlock();
1980}
1981EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1982
1983/**
1984 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1985 * @dev: Network device
1986 * @txq: number of queues available
1987 *
1988 * If real_num_tx_queues is changed the tc mappings may no longer be
1989 * valid. To resolve this verify the tc mapping remains valid and if
1990 * not, zero the affected mapping. With no priorities mapping to this
1991 * offset/count pair it will no longer be used. In the worst case, TC0
1992 * itself is invalid and nothing can be done, so priority mappings are
1993 * disabled entirely. It is expected that drivers will fix this mapping
1994 * if they can before calling netif_set_real_num_tx_queues.
1995 */
1996static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1997{
1998        int i;
1999        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2000
2001        /* If TC0 is invalidated disable TC mapping */
2002        if (tc->offset + tc->count > txq) {
2003                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2004                dev->num_tc = 0;
2005                return;
2006        }
2007
2008        /* Reset any invalidated prio-to-tc mappings to TC0 */
2009        for (i = 1; i < TC_BITMASK + 1; i++) {
2010                int q = netdev_get_prio_tc_map(dev, i);
2011
2012                tc = &dev->tc_to_txq[q];
2013                if (tc->offset + tc->count > txq) {
2014                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2015                                i, q);
2016                        netdev_set_prio_tc_map(dev, i, 0);
2017                }
2018        }
2019}
2020
2021int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2022{
2023        if (dev->num_tc) {
2024                struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2025                int i;
2026
2027                for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2028                        if ((txq - tc->offset) < tc->count)
2029                                return i;
2030                }
2031
2032                return -1;
2033        }
2034
2035        return 0;
2036}
2037EXPORT_SYMBOL(netdev_txq_to_tc);
2038
2039#ifdef CONFIG_XPS
2040static DEFINE_MUTEX(xps_map_mutex);
2041#define xmap_dereference(P)             \
2042        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2043
2044static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2045                             int tci, u16 index)
2046{
2047        struct xps_map *map = NULL;
2048        int pos;
2049
2050        if (dev_maps)
2051                map = xmap_dereference(dev_maps->cpu_map[tci]);
2052        if (!map)
2053                return false;
2054
2055        for (pos = map->len; pos--;) {
2056                if (map->queues[pos] != index)
2057                        continue;
2058
2059                if (map->len > 1) {
2060                        map->queues[pos] = map->queues[--map->len];
2061                        break;
2062                }
2063
2064                RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
2065                kfree_rcu(map, rcu);
2066                return false;
2067        }
2068
2069        return true;
2070}
2071
2072static bool remove_xps_queue_cpu(struct net_device *dev,
2073                                 struct xps_dev_maps *dev_maps,
2074                                 int cpu, u16 offset, u16 count)
2075{
2076        int num_tc = dev->num_tc ? : 1;
2077        bool active = false;
2078        int tci;
2079
2080        for (tci = cpu * num_tc; num_tc--; tci++) {
2081                int i, j;
2082
2083                for (i = count, j = offset; i--; j++) {
2084                        if (!remove_xps_queue(dev_maps, cpu, j))
2085                                break;
2086                }
2087
2088                active |= i < 0;
2089        }
2090
2091        return active;
2092}
2093
2094static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2095                                   u16 count)
2096{
2097        struct xps_dev_maps *dev_maps;
2098        int cpu, i;
2099        bool active = false;
2100
2101        mutex_lock(&xps_map_mutex);
2102        dev_maps = xmap_dereference(dev->xps_maps);
2103
2104        if (!dev_maps)
2105                goto out_no_maps;
2106
2107        for_each_possible_cpu(cpu)
2108                active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2109                                               offset, count);
2110
2111        if (!active) {
2112                RCU_INIT_POINTER(dev->xps_maps, NULL);
2113                kfree_rcu(dev_maps, rcu);
2114        }
2115
2116        for (i = offset + (count - 1); count--; i--)
2117                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2118                                             NUMA_NO_NODE);
2119
2120out_no_maps:
2121        mutex_unlock(&xps_map_mutex);
2122}
2123
2124static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2125{
2126        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2127}
2128
2129static struct xps_map *expand_xps_map(struct xps_map *map,
2130                                      int cpu, u16 index)
2131{
2132        struct xps_map *new_map;
2133        int alloc_len = XPS_MIN_MAP_ALLOC;
2134        int i, pos;
2135
2136        for (pos = 0; map && pos < map->len; pos++) {
2137                if (map->queues[pos] != index)
2138                        continue;
2139                return map;
2140        }
2141
2142        /* Need to add queue to this CPU's existing map */
2143        if (map) {
2144                if (pos < map->alloc_len)
2145                        return map;
2146
2147                alloc_len = map->alloc_len * 2;
2148        }
2149
2150        /* Need to allocate a new map to store the queue on this CPU */
2151        new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2152                               cpu_to_node(cpu));
2153        if (!new_map)
2154                return NULL;
2155
2156        for (i = 0; i < pos; i++)
2157                new_map->queues[i] = map->queues[i];
2158        new_map->alloc_len = alloc_len;
2159        new_map->len = pos;
2160
2161        return new_map;
2162}
2163
2164int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2165                        u16 index)
2166{
2167        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2168        int i, cpu, tci, numa_node_id = -2;
2169        int maps_sz, num_tc = 1, tc = 0;
2170        struct xps_map *map, *new_map;
2171        bool active = false;
2172
2173        if (dev->num_tc) {
2174                num_tc = dev->num_tc;
2175                tc = netdev_txq_to_tc(dev, index);
2176                if (tc < 0)
2177                        return -EINVAL;
2178        }
2179
2180        maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2181        if (maps_sz < L1_CACHE_BYTES)
2182                maps_sz = L1_CACHE_BYTES;
2183
2184        mutex_lock(&xps_map_mutex);
2185
2186        dev_maps = xmap_dereference(dev->xps_maps);
2187
2188        /* allocate memory for queue storage */
2189        for_each_cpu_and(cpu, cpu_online_mask, mask) {
2190                if (!new_dev_maps)
2191                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2192                if (!new_dev_maps) {
2193                        mutex_unlock(&xps_map_mutex);
2194                        return -ENOMEM;
2195                }
2196
2197                tci = cpu * num_tc + tc;
2198                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2199                                 NULL;
2200
2201                map = expand_xps_map(map, cpu, index);
2202                if (!map)
2203                        goto error;
2204
2205                RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2206        }
2207
2208        if (!new_dev_maps)
2209                goto out_no_new_maps;
2210
2211        for_each_possible_cpu(cpu) {
2212                /* copy maps belonging to foreign traffic classes */
2213                for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2214                        /* fill in the new device map from the old device map */
2215                        map = xmap_dereference(dev_maps->cpu_map[tci]);
2216                        RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2217                }
2218
2219                /* We need to explicitly update tci as the previous loop
2220                 * could break out early if dev_maps is NULL.
2221                 */
2222                tci = cpu * num_tc + tc;
2223
2224                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2225                        /* add queue to CPU maps */
2226                        int pos = 0;
2227
2228                        map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2229                        while ((pos < map->len) && (map->queues[pos] != index))
2230                                pos++;
2231
2232                        if (pos == map->len)
2233                                map->queues[map->len++] = index;
2234#ifdef CONFIG_NUMA
2235                        if (numa_node_id == -2)
2236                                numa_node_id = cpu_to_node(cpu);
2237                        else if (numa_node_id != cpu_to_node(cpu))
2238                                numa_node_id = -1;
2239#endif
2240                } else if (dev_maps) {
2241                        /* fill in the new device map from the old device map */
2242                        map = xmap_dereference(dev_maps->cpu_map[tci]);
2243                        RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2244                }
2245
2246                /* copy maps belonging to foreign traffic classes */
2247                for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2248                        /* fill in the new device map from the old device map */
2249                        map = xmap_dereference(dev_maps->cpu_map[tci]);
2250                        RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2251                }
2252        }
2253
2254        rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2255
2256        /* Cleanup old maps */
2257        if (!dev_maps)
2258                goto out_no_old_maps;
2259
2260        for_each_possible_cpu(cpu) {
2261                for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2262                        new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2263                        map = xmap_dereference(dev_maps->cpu_map[tci]);
2264                        if (map && map != new_map)
2265                                kfree_rcu(map, rcu);
2266                }
2267        }
2268
2269        kfree_rcu(dev_maps, rcu);
2270
2271out_no_old_maps:
2272        dev_maps = new_dev_maps;
2273        active = true;
2274
2275out_no_new_maps:
2276        /* update Tx queue numa node */
2277        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2278                                     (numa_node_id >= 0) ? numa_node_id :
2279                                     NUMA_NO_NODE);
2280
2281        if (!dev_maps)
2282                goto out_no_maps;
2283
2284        /* removes queue from unused CPUs */
2285        for_each_possible_cpu(cpu) {
2286                for (i = tc, tci = cpu * num_tc; i--; tci++)
2287                        active |= remove_xps_queue(dev_maps, tci, index);
2288                if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2289                        active |= remove_xps_queue(dev_maps, tci, index);
2290                for (i = num_tc - tc, tci++; --i; tci++)
2291                        active |= remove_xps_queue(dev_maps, tci, index);
2292        }
2293
2294        /* free map if not active */
2295        if (!active) {
2296                RCU_INIT_POINTER(dev->xps_maps, NULL);
2297                kfree_rcu(dev_maps, rcu);
2298        }
2299
2300out_no_maps:
2301        mutex_unlock(&xps_map_mutex);
2302
2303        return 0;
2304error:
2305        /* remove any maps that we added */
2306        for_each_possible_cpu(cpu) {
2307                for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2308                        new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2309                        map = dev_maps ?
2310                              xmap_dereference(dev_maps->cpu_map[tci]) :
2311                              NULL;
2312                        if (new_map && new_map != map)
2313                                kfree(new_map);
2314                }
2315        }
2316
2317        mutex_unlock(&xps_map_mutex);
2318
2319        kfree(new_dev_maps);
2320        return -ENOMEM;
2321}
2322EXPORT_SYMBOL(netif_set_xps_queue);
2323
2324#endif
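
/*
 * Example (illustrative sketch, not from the original file): a multiqueue
 * driver pinning each TX queue to one online CPU at setup time.  The 1:1
 * modulo policy and example_setup_xps() are purely illustrative; without
 * CONFIG_XPS the call is a no-op.
 */
static void example_setup_xps(struct net_device *dev)
{
        unsigned int i;

        for (i = 0; i < dev->real_num_tx_queues; i++)
                netif_set_xps_queue(dev, cpumask_of(i % num_online_cpus()), i);
}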
2325void netdev_reset_tc(struct net_device *dev)
2326{
2327#ifdef CONFIG_XPS
2328        netif_reset_xps_queues_gt(dev, 0);
2329#endif
2330        dev->num_tc = 0;
2331        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2332        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2333}
2334EXPORT_SYMBOL(netdev_reset_tc);
2335
2336int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2337{
2338        if (tc >= dev->num_tc)
2339                return -EINVAL;
2340
2341#ifdef CONFIG_XPS
2342        netif_reset_xps_queues(dev, offset, count);
2343#endif
2344        dev->tc_to_txq[tc].count = count;
2345        dev->tc_to_txq[tc].offset = offset;
2346        return 0;
2347}
2348EXPORT_SYMBOL(netdev_set_tc_queue);
2349
2350int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2351{
2352        if (num_tc > TC_MAX_QUEUE)
2353                return -EINVAL;
2354
2355#ifdef CONFIG_XPS
2356        netif_reset_xps_queues_gt(dev, 0);
2357#endif
2358        dev->num_tc = num_tc;
2359        return 0;
2360}
2361EXPORT_SYMBOL(netdev_set_num_tc);
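
/*
 * Example (illustrative sketch, not from the original file): carving eight
 * TX queues into two traffic classes and steering priority 5 onto the
 * second one.  The queue split and example_setup_tc() are hypothetical.
 */
static int example_setup_tc(struct net_device *dev)
{
        int err;

        err = netdev_set_num_tc(dev, 2);
        if (err)
                return err;

        netdev_set_tc_queue(dev, 0, 4, 0);      /* TC0: queues 0-3 */
        netdev_set_tc_queue(dev, 1, 4, 4);      /* TC1: queues 4-7 */

        return netdev_set_prio_tc_map(dev, 5, 1);
}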
2362
2363/*
2364 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2365 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2366 */
2367int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2368{
2369        bool disabling;
2370        int rc;
2371
2372        disabling = txq < dev->real_num_tx_queues;
2373
2374        if (txq < 1 || txq > dev->num_tx_queues)
2375                return -EINVAL;
2376
2377        if (dev->reg_state == NETREG_REGISTERED ||
2378            dev->reg_state == NETREG_UNREGISTERING) {
2379                ASSERT_RTNL();
2380
2381                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2382                                                  txq);
2383                if (rc)
2384                        return rc;
2385
2386                if (dev->num_tc)
2387                        netif_setup_tc(dev, txq);
2388
2389                dev->real_num_tx_queues = txq;
2390
2391                if (disabling) {
2392                        synchronize_net();
2393                        qdisc_reset_all_tx_gt(dev, txq);
2394#ifdef CONFIG_XPS
2395                        netif_reset_xps_queues_gt(dev, txq);
2396#endif
2397                }
2398        } else {
2399                dev->real_num_tx_queues = txq;
2400        }
2401
2402        return 0;
2403}
2404EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2405
2406#ifdef CONFIG_SYSFS
2407/**
2408 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2409 *      @dev: Network device
2410 *      @rxq: Actual number of RX queues
2411 *
2412 *      This must be called either with the rtnl_lock held or before
2413 *      registration of the net device.  Returns 0 on success, or a
2414 *      negative error code.  If called before registration, it always
2415 *      succeeds.
2416 */
2417int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2418{
2419        int rc;
2420
2421        if (rxq < 1 || rxq > dev->num_rx_queues)
2422                return -EINVAL;
2423
2424        if (dev->reg_state == NETREG_REGISTERED) {
2425                ASSERT_RTNL();
2426
2427                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2428                                                  rxq);
2429                if (rc)
2430                        return rc;
2431        }
2432
2433        dev->real_num_rx_queues = rxq;
2434        return 0;
2435}
2436EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2437#endif
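
/*
 * Example (illustrative sketch, not from the original file): a driver
 * shrinking or growing both active queue counts after a channel
 * reconfiguration.  RTNL is assumed held by the caller and
 * example_set_channels() is hypothetical.
 */
static int example_set_channels(struct net_device *dev, unsigned int count)
{
        int err;

        ASSERT_RTNL();

        err = netif_set_real_num_tx_queues(dev, count);
        if (err)
                return err;

        return netif_set_real_num_rx_queues(dev, count);
}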
2438
2439/**
2440 * netif_get_num_default_rss_queues - default number of RSS queues
2441 *
2442 * This routine should set an upper limit on the number of RSS queues
2443 * used by default by multiqueue devices.
2444 */
2445int netif_get_num_default_rss_queues(void)
2446{
2447        return is_kdump_kernel() ?
2448                1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2449}
2450EXPORT_SYMBOL(netif_get_num_default_rss_queues);
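
/*
 * Example (illustrative sketch, not from the original file): a driver
 * capping its RSS ring count at the recommended default.
 * example_hw_max_rings is a hypothetical hardware limit.
 */
static unsigned int example_pick_rss_rings(unsigned int example_hw_max_rings)
{
        return min_t(unsigned int, example_hw_max_rings,
                     netif_get_num_default_rss_queues());
}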
2451
2452static void __netif_reschedule(struct Qdisc *q)
2453{
2454        struct softnet_data *sd;
2455        unsigned long flags;
2456
2457        local_irq_save(flags);
2458        sd = this_cpu_ptr(&softnet_data);
2459        q->next_sched = NULL;
2460        *sd->output_queue_tailp = q;
2461        sd->output_queue_tailp = &q->next_sched;
2462        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2463        local_irq_restore(flags);
2464}
2465
2466void __netif_schedule(struct Qdisc *q)
2467{
2468        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2469                __netif_reschedule(q);
2470}
2471EXPORT_SYMBOL(__netif_schedule);
2472
2473struct dev_kfree_skb_cb {
2474        enum skb_free_reason reason;
2475};
2476
2477static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2478{
2479        return (struct dev_kfree_skb_cb *)skb->cb;
2480}
2481
2482void netif_schedule_queue(struct netdev_queue *txq)
2483{
2484        rcu_read_lock();
2485        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2486                struct Qdisc *q = rcu_dereference(txq->qdisc);
2487
2488                __netif_schedule(q);
2489        }
2490        rcu_read_unlock();
2491}
2492EXPORT_SYMBOL(netif_schedule_queue);
2493
2494void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2495{
2496        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2497                struct Qdisc *q;
2498
2499                rcu_read_lock();
2500                q = rcu_dereference(dev_queue->qdisc);
2501                __netif_schedule(q);
2502                rcu_read_unlock();
2503        }
2504}
2505EXPORT_SYMBOL(netif_tx_wake_queue);
2506
2507void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2508{
2509        unsigned long flags;
2510
2511        if (unlikely(!skb))
2512                return;
2513
2514        if (likely(refcount_read(&skb->users) == 1)) {
2515                smp_rmb();
2516                refcount_set(&skb->users, 0);
2517        } else if (likely(!refcount_dec_and_test(&skb->users))) {
2518                return;
2519        }
2520        get_kfree_skb_cb(skb)->reason = reason;
2521        local_irq_save(flags);
2522        skb->next = __this_cpu_read(softnet_data.completion_queue);
2523        __this_cpu_write(softnet_data.completion_queue, skb);
2524        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2525        local_irq_restore(flags);
2526}
2527EXPORT_SYMBOL(__dev_kfree_skb_irq);
2528
2529void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2530{
2531        if (in_irq() || irqs_disabled())
2532                __dev_kfree_skb_irq(skb, reason);
2533        else
2534                dev_kfree_skb(skb);
2535}
2536EXPORT_SYMBOL(__dev_kfree_skb_any);
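
/*
 * Example (illustrative sketch, not from the original file): a TX completion
 * handler that may run in hard-IRQ context, so it uses the *_any variants.
 * Distinguishing consume from drop only matters for drop tracing;
 * example_tx_complete() is hypothetical.
 */
static void example_tx_complete(struct sk_buff *skb, bool transmitted)
{
        if (transmitted)
                dev_consume_skb_any(skb);
        else
                dev_kfree_skb_any(skb);
}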
2537
2538
2539/**
2540 * netif_device_detach - mark device as removed
2541 * @dev: network device
2542 *
2543 * Mark device as removed from the system and therefore no longer available.
2544 */
2545void netif_device_detach(struct net_device *dev)
2546{
2547        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2548            netif_running(dev)) {
2549                netif_tx_stop_all_queues(dev);
2550        }
2551}
2552EXPORT_SYMBOL(netif_device_detach);
2553
2554/**
2555 * netif_device_attach - mark device as attached
2556 * @dev: network device
2557 *
2558 * Mark device as attached to the system and restart it if needed.
2559 */
2560void netif_device_attach(struct net_device *dev)
2561{
2562        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2563            netif_running(dev)) {
2564                netif_tx_wake_all_queues(dev);
2565                __netdev_watchdog_up(dev);
2566        }
2567}
2568EXPORT_SYMBOL(netif_device_attach);
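
/*
 * Example (illustrative sketch, not from the original file): the usual
 * detach/attach pairing around driver suspend and resume.  The example_*
 * callbacks are hypothetical; real drivers also quiesce their hardware here.
 */
static int example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);       /* stop all TX queues if running */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        netif_device_attach(dev);       /* wake queues and restart the watchdog */
        return 0;
}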
2569
2570/*
2571 * Returns a Tx hash based on the given packet descriptor, using the
2572 * supplied number of Tx queues as the distribution range.
2573 */
2574u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2575                  unsigned int num_tx_queues)
2576{
2577        u32 hash;
2578        u16 qoffset = 0;
2579        u16 qcount = num_tx_queues;
2580
2581        if (skb_rx_queue_recorded(skb)) {
2582                hash = skb_get_rx_queue(skb);
2583                while (unlikely(hash >= num_tx_queues))
2584                        hash -= num_tx_queues;
2585                return hash;
2586        }
2587
2588        if (dev->num_tc) {
2589                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2590
2591                qoffset = dev->tc_to_txq[tc].offset;
2592                qcount = dev->tc_to_txq[tc].count;
2593        }
2594
2595        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2596}
2597EXPORT_SYMBOL(__skb_tx_hash);
2598
2599static void skb_warn_bad_offload(const struct sk_buff *skb)
2600{
2601        static const netdev_features_t null_features;
2602        struct net_device *dev = skb->dev;
2603        const char *name = "";
2604
2605        if (!net_ratelimit())
2606                return;
2607
2608        if (dev) {
2609                if (dev->dev.parent)
2610                        name = dev_driver_string(dev->dev.parent);
2611                else
2612                        name = netdev_name(dev);
2613        }
2614        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2615             "gso_type=%d ip_summed=%d\n",
2616             name, dev ? &dev->features : &null_features,
2617             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2618             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2619             skb_shinfo(skb)->gso_type, skb->ip_summed);
2620}
2621
2622/*
2623 * Invalidate hardware checksum when packet is to be mangled, and
2624 * complete checksum manually on outgoing path.
2625 */
2626int skb_checksum_help(struct sk_buff *skb)
2627{
2628        __wsum csum;
2629        int ret = 0, offset;
2630
2631        if (skb->ip_summed == CHECKSUM_COMPLETE)
2632                goto out_set_summed;
2633
2634        if (unlikely(skb_shinfo(skb)->gso_size)) {
2635                skb_warn_bad_offload(skb);
2636                return -EINVAL;
2637        }
2638
2639        /* Before computing a checksum, we should make sure no frag could
2640         * be modified by an external entity: the checksum could be wrong.
2641         */
2642        if (skb_has_shared_frag(skb)) {
2643                ret = __skb_linearize(skb);
2644                if (ret)
2645                        goto out;
2646        }
2647
2648        offset = skb_checksum_start_offset(skb);
2649        BUG_ON(offset >= skb_headlen(skb));
2650        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2651
2652        offset += skb->csum_offset;
2653        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2654
2655        if (skb_cloned(skb) &&
2656            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2657                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2658                if (ret)
2659                        goto out;
2660        }
2661
2662        *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2663out_set_summed:
2664        skb->ip_summed = CHECKSUM_NONE;
2665out:
2666        return ret;
2667}
2668EXPORT_SYMBOL(skb_checksum_help);
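
/*
 * Example (illustrative sketch, not from the original file): a driver with
 * no checksum offload for a given protocol resolving CHECKSUM_PARTIAL in
 * software before handing the skb to hardware.  example_tx_csum() is
 * hypothetical.
 */
static int example_tx_csum(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                return skb_checksum_help(skb);  /* folds and stores the csum */

        return 0;
}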
2669
2670int skb_crc32c_csum_help(struct sk_buff *skb)
2671{
2672        __le32 crc32c_csum;
2673        int ret = 0, offset, start;
2674
2675        if (skb->ip_summed != CHECKSUM_PARTIAL)
2676                goto out;
2677
2678        if (unlikely(skb_is_gso(skb)))
2679                goto out;
2680
2681        /* Before computing a checksum, we should make sure no frag could
2682         * be modified by an external entity: the checksum could be wrong.
2683         */
2684        if (unlikely(skb_has_shared_frag(skb))) {
2685                ret = __skb_linearize(skb);
2686                if (ret)
2687                        goto out;
2688        }
2689        start = skb_checksum_start_offset(skb);
2690        offset = start + offsetof(struct sctphdr, checksum);
2691        if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
2692                ret = -EINVAL;
2693                goto out;
2694        }
2695        if (skb_cloned(skb) &&
2696            !skb_clone_writable(skb, offset + sizeof(__le32))) {
2697                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2698                if (ret)
2699                        goto out;
2700        }
2701        crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
2702                                                  skb->len - start, ~(__u32)0,
2703                                                  crc32c_csum_stub));
2704        *(__le32 *)(skb->data + offset) = crc32c_csum;
2705        skb->ip_summed = CHECKSUM_NONE;
2706        skb->csum_not_inet = 0;
2707out:
2708        return ret;
2709}
2710
2711__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2712{
2713        __be16 type = skb->protocol;
2714
2715        /* Tunnel gso handlers can set protocol to ethernet. */
2716        if (type == htons(ETH_P_TEB)) {
2717                struct ethhdr *eth;
2718
2719                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2720                        return 0;
2721
2722                eth = (struct ethhdr *)skb_mac_header(skb);
2723                type = eth->h_proto;
2724        }
2725
2726        return __vlan_get_protocol(skb, type, depth);
2727}
2728
2729/**
2730 *      skb_mac_gso_segment - mac layer segmentation handler.
2731 *      @skb: buffer to segment
2732 *      @features: features for the output path (see dev->features)
2733 */
2734struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2735                                    netdev_features_t features)
2736{
2737        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2738        struct packet_offload *ptype;
2739        int vlan_depth = skb->mac_len;
2740        __be16 type = skb_network_protocol(skb, &vlan_depth);
2741
2742        if (unlikely(!type))
2743                return ERR_PTR(-EINVAL);
2744
2745        __skb_pull(skb, vlan_depth);
2746
2747        rcu_read_lock();
2748        list_for_each_entry_rcu(ptype, &offload_base, list) {
2749                if (ptype->type == type && ptype->callbacks.gso_segment) {
2750                        segs = ptype->callbacks.gso_segment(skb, features);
2751                        break;
2752                }
2753        }
2754        rcu_read_unlock();
2755
2756        __skb_push(skb, skb->data - skb_mac_header(skb));
2757
2758        return segs;
2759}
2760EXPORT_SYMBOL(skb_mac_gso_segment);
2761
2762
2763/* openvswitch calls this on rx path, so we need a different check.
2764 */
2765static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2766{
2767        if (tx_path)
2768                return skb->ip_summed != CHECKSUM_PARTIAL &&
2769                       skb->ip_summed != CHECKSUM_UNNECESSARY;
2770
2771        return skb->ip_summed == CHECKSUM_NONE;
2772}
2773
2774/**
2775 *      __skb_gso_segment - Perform segmentation on skb.
2776 *      @skb: buffer to segment
2777 *      @features: features for the output path (see dev->features)
2778 *      @tx_path: whether it is called in TX path
2779 *
2780 *      This function segments the given skb and returns a list of segments.
2781 *
2782 *      It may return NULL if the skb requires no segmentation.  This is
2783 *      only possible when GSO is used for verifying header integrity.
2784 *
2785 *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2786 */
2787struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2788                                  netdev_features_t features, bool tx_path)
2789{
2790        struct sk_buff *segs;
2791
2792        if (unlikely(skb_needs_check(skb, tx_path))) {
2793                int err;
2794
2795                /* We're going to init ->check field in TCP or UDP header */
2796                err = skb_cow_head(skb, 0);
2797                if (err < 0)
2798                        return ERR_PTR(err);
2799        }
2800
2801        /* Only report GSO partial support if it will enable us to
2802         * support segmentation on this frame without needing additional
2803         * work.
2804         */
2805        if (features & NETIF_F_GSO_PARTIAL) {
2806                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2807                struct net_device *dev = skb->dev;
2808
2809                partial_features |= dev->features & dev->gso_partial_features;
2810                if (!skb_gso_ok(skb, features | partial_features))
2811                        features &= ~NETIF_F_GSO_PARTIAL;
2812        }
2813
2814        BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2815                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2816
2817        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2818        SKB_GSO_CB(skb)->encap_level = 0;
2819
2820        skb_reset_mac_header(skb);
2821        skb_reset_mac_len(skb);
2822
2823        segs = skb_mac_gso_segment(skb, features);
2824
2825        if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
2826                skb_warn_bad_offload(skb);
2827
2828        return segs;
2829}
2830EXPORT_SYMBOL(__skb_gso_segment);
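
/*
 * Example (illustrative sketch, not from the original file): segmenting a
 * GSO skb in software and walking the resulting list, roughly as
 * validate_xmit_skb() does further below.  example_xmit_one() is a
 * hypothetical per-segment transmit hook.
 */
static int example_software_gso(struct sk_buff *skb, netdev_features_t features,
                                int (*example_xmit_one)(struct sk_buff *))
{
        struct sk_buff *segs, *next;

        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
        if (IS_ERR(segs))
                return PTR_ERR(segs);

        if (!segs)                              /* no segmentation was needed */
                return example_xmit_one(skb);

        consume_skb(skb);                       /* original replaced by the list */
        do {
                next = segs->next;
                segs->next = NULL;
                example_xmit_one(segs);
                segs = next;
        } while (segs);

        return 0;
}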
2831
2832/* Take action when hardware reception checksum errors are detected. */
2833#ifdef CONFIG_BUG
2834void netdev_rx_csum_fault(struct net_device *dev)
2835{
2836        if (net_ratelimit()) {
2837                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2838                dump_stack();
2839        }
2840}
2841EXPORT_SYMBOL(netdev_rx_csum_fault);
2842#endif
2843
2844/* Actually, we should eliminate this check as soon as we know that:
2845 * 1. An IOMMU is present and can map all of the memory.
2846 * 2. No high memory really exists on this machine.
2847 */
2848
2849static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2850{
2851#ifdef CONFIG_HIGHMEM
2852        int i;
2853
2854        if (!(dev->features & NETIF_F_HIGHDMA)) {
2855                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2856                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2857
2858                        if (PageHighMem(skb_frag_page(frag)))
2859                                return 1;
2860                }
2861        }
2862
2863        if (PCI_DMA_BUS_IS_PHYS) {
2864                struct device *pdev = dev->dev.parent;
2865
2866                if (!pdev)
2867                        return 0;
2868                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2869                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2870                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2871
2872                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2873                                return 1;
2874                }
2875        }
2876#endif
2877        return 0;
2878}
2879
2880/* If MPLS offload request, verify we are testing hardware MPLS features
2881 * instead of standard features for the netdev.
2882 */
2883#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2884static netdev_features_t net_mpls_features(struct sk_buff *skb,
2885                                           netdev_features_t features,
2886                                           __be16 type)
2887{
2888        if (eth_p_mpls(type))
2889                features &= skb->dev->mpls_features;
2890
2891        return features;
2892}
2893#else
2894static netdev_features_t net_mpls_features(struct sk_buff *skb,
2895                                           netdev_features_t features,
2896                                           __be16 type)
2897{
2898        return features;
2899}
2900#endif
2901
2902static netdev_features_t harmonize_features(struct sk_buff *skb,
2903        netdev_features_t features)
2904{
2905        int tmp;
2906        __be16 type;
2907
2908        type = skb_network_protocol(skb, &tmp);
2909        features = net_mpls_features(skb, features, type);
2910
2911        if (skb->ip_summed != CHECKSUM_NONE &&
2912            !can_checksum_protocol(features, type)) {
2913                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2914        }
2915        if (illegal_highdma(skb->dev, skb))
2916                features &= ~NETIF_F_SG;
2917
2918        return features;
2919}
2920
2921netdev_features_t passthru_features_check(struct sk_buff *skb,
2922                                          struct net_device *dev,
2923                                          netdev_features_t features)
2924{
2925        return features;
2926}
2927EXPORT_SYMBOL(passthru_features_check);
2928
2929static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2930                                             struct net_device *dev,
2931                                             netdev_features_t features)
2932{
2933        return vlan_features_check(skb, features);
2934}
2935
2936static netdev_features_t gso_features_check(const struct sk_buff *skb,
2937                                            struct net_device *dev,
2938                                            netdev_features_t features)
2939{
2940        u16 gso_segs = skb_shinfo(skb)->gso_segs;
2941
2942        if (gso_segs > dev->gso_max_segs)
2943                return features & ~NETIF_F_GSO_MASK;
2944
2945        /* Support for GSO partial features requires software
2946         * intervention before we can actually process the packets,
2947         * so we need to strip support for any partial features now
2948         * and pull them back in after we have partially segmented
2949         * the frame.
2950         */
2951        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2952                features &= ~dev->gso_partial_features;
2953
2954        /* Make sure to clear the IPv4 ID mangling feature if the
2955         * IPv4 header has the potential to be fragmented.
2956         */
2957        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2958                struct iphdr *iph = skb->encapsulation ?
2959                                    inner_ip_hdr(skb) : ip_hdr(skb);
2960
2961                if (!(iph->frag_off & htons(IP_DF)))
2962                        features &= ~NETIF_F_TSO_MANGLEID;
2963        }
2964
2965        return features;
2966}
2967
2968netdev_features_t netif_skb_features(struct sk_buff *skb)
2969{
2970        struct net_device *dev = skb->dev;
2971        netdev_features_t features = dev->features;
2972
2973        if (skb_is_gso(skb))
2974                features = gso_features_check(skb, dev, features);
2975
2976        /* If encapsulation offload request, verify we are testing
2977         * hardware encapsulation features instead of standard
2978         * features for the netdev
2979         */
2980        if (skb->encapsulation)
2981                features &= dev->hw_enc_features;
2982
2983        if (skb_vlan_tagged(skb))
2984                features = netdev_intersect_features(features,
2985                                                     dev->vlan_features |
2986                                                     NETIF_F_HW_VLAN_CTAG_TX |
2987                                                     NETIF_F_HW_VLAN_STAG_TX);
2988
2989        if (dev->netdev_ops->ndo_features_check)
2990                features &= dev->netdev_ops->ndo_features_check(skb, dev,
2991                                                                features);
2992        else
2993                features &= dflt_features_check(skb, dev, features);
2994
2995        return harmonize_features(skb, features);
2996}
2997EXPORT_SYMBOL(netif_skb_features);
2998
2999static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3000                    struct netdev_queue *txq, bool more)
3001{
3002        unsigned int len;
3003        int rc;
3004
3005        if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
3006                dev_queue_xmit_nit(skb, dev);
3007
3008        len = skb->len;
3009        trace_net_dev_start_xmit(skb, dev);
3010        rc = netdev_start_xmit(skb, dev, txq, more);
3011        trace_net_dev_xmit(skb, rc, dev, len);
3012
3013        return rc;
3014}
3015
3016struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3017                                    struct netdev_queue *txq, int *ret)
3018{
3019        struct sk_buff *skb = first;
3020        int rc = NETDEV_TX_OK;
3021
3022        while (skb) {
3023                struct sk_buff *next = skb->next;
3024
3025                skb->next = NULL;
3026                rc = xmit_one(skb, dev, txq, next != NULL);
3027                if (unlikely(!dev_xmit_complete(rc))) {
3028                        skb->next = next;
3029                        goto out;
3030                }
3031
3032                skb = next;
3033                if (netif_xmit_stopped(txq) && skb) {
3034                        rc = NETDEV_TX_BUSY;
3035                        break;
3036                }
3037        }
3038
3039out:
3040        *ret = rc;
3041        return skb;
3042}
3043
3044static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3045                                          netdev_features_t features)
3046{
3047        if (skb_vlan_tag_present(skb) &&
3048            !vlan_hw_offload_capable(features, skb->vlan_proto))
3049                skb = __vlan_hwaccel_push_inside(skb);
3050        return skb;
3051}
3052
3053int skb_csum_hwoffload_help(struct sk_buff *skb,
3054                            const netdev_features_t features)
3055{
3056        if (unlikely(skb->csum_not_inet))
3057                return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3058                        skb_crc32c_csum_help(skb);
3059
3060        return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3061}
3062EXPORT_SYMBOL(skb_csum_hwoffload_help);
3063
3064static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
3065{
3066        netdev_features_t features;
3067
3068        features = netif_skb_features(skb);
3069        skb = validate_xmit_vlan(skb, features);
3070        if (unlikely(!skb))
3071                goto out_null;
3072
3073        if (netif_needs_gso(skb, features)) {
3074                struct sk_buff *segs;
3075
3076                segs = skb_gso_segment(skb, features);
3077                if (IS_ERR(segs)) {
3078                        goto out_kfree_skb;
3079                } else if (segs) {
3080                        consume_skb(skb);
3081                        skb = segs;
3082                }
3083        } else {
3084                if (skb_needs_linearize(skb, features) &&
3085                    __skb_linearize(skb))
3086                        goto out_kfree_skb;
3087
3088                if (validate_xmit_xfrm(skb, features))
3089                        goto out_kfree_skb;
3090
3091                /* If packet is not checksummed and device does not
3092                 * support checksumming for this protocol, complete
3093                 * checksumming here.
3094                 */
3095                if (skb->ip_summed == CHECKSUM_PARTIAL) {
3096                        if (skb->encapsulation)
3097                                skb_set_inner_transport_header(skb,
3098                                                               skb_checksum_start_offset(skb));
3099                        else
3100                                skb_set_transport_header(skb,
3101                                                         skb_checksum_start_offset(skb));
3102                        if (skb_csum_hwoffload_help(skb, features))
3103                                goto out_kfree_skb;
3104                }
3105        }
3106
3107        return skb;
3108
3109out_kfree_skb:
3110        kfree_skb(skb);
3111out_null:
3112        atomic_long_inc(&dev->tx_dropped);
3113        return NULL;
3114}
3115
3116struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3117{
3118        struct sk_buff *next, *head = NULL, *tail;
3119
3120        for (; skb != NULL; skb = next) {
3121                next = skb->next;
3122                skb->next = NULL;
3123
3124                /* in case the skb won't be segmented, point it to itself */
3125                skb->prev = skb;
3126
3127                skb = validate_xmit_skb(skb, dev);
3128                if (!skb)
3129                        continue;
3130
3131                if (!head)
3132                        head = skb;
3133                else
3134                        tail->next = skb;
3135                /* If skb was segmented, skb->prev points to
3136                 * the last segment. If not, it still points to skb itself.
3137                 */
3138                tail = skb->prev;
3139        }
3140        return head;
3141}
3142EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3143
3144static void qdisc_pkt_len_init(struct sk_buff *skb)
3145{
3146        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3147
3148        qdisc_skb_cb(skb)->pkt_len = skb->len;
3149
3150        /* To get a more precise estimate of bytes sent on the wire,
3151         * we add to pkt_len the header size of every additional segment.
3152         */
3153        if (shinfo->gso_size)  {
3154                unsigned int hdr_len;
3155                u16 gso_segs = shinfo->gso_segs;
3156
3157                /* mac layer + network layer */
3158                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3159
3160                /* + transport layer */
3161                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3162                        const struct tcphdr *th;
3163                        struct tcphdr _tcphdr;
3164
3165                        th = skb_header_pointer(skb, skb_transport_offset(skb),
3166                                                sizeof(_tcphdr), &_tcphdr);
3167                        if (likely(th))
3168                                hdr_len += __tcp_hdrlen(th);
3169                } else {
3170                        struct udphdr _udphdr;
3171
3172                        if (skb_header_pointer(skb, skb_transport_offset(skb),
3173                                               sizeof(_udphdr), &_udphdr))
3174                                hdr_len += sizeof(struct udphdr);
3175                }
3176
3177                if (shinfo->gso_type & SKB_GSO_DODGY)
3178                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3179                                                shinfo->gso_size);
3180
3181                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3182        }
3183}
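
/* Worked example (illustrative, hypothetical numbers): a TSO skb with
 * 14-byte Ethernet + 20-byte IPv4 + 20-byte TCP headers (hdr_len = 54),
 * gso_size = 1448 and 4344 bytes of payload has skb->len = 4398 and
 * gso_segs = 3.  qdisc_pkt_len_init() then reports
 *
 *	pkt_len = 4398 + (3 - 1) * 54 = 4506
 *
 * which matches the three 1502-byte frames that actually hit the wire.
 */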
3184
3185static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3186                                 struct net_device *dev,
3187                                 struct netdev_queue *txq)
3188{
3189        spinlock_t *root_lock = qdisc_lock(q);
3190        struct sk_buff *to_free = NULL;
3191        bool contended;
3192        int rc;
3193
3194        qdisc_calculate_pkt_len(skb, q);
3195        /*
3196         * Heuristic to force contended enqueues to serialize on a
3197         * separate lock before trying to get the qdisc main lock.
3198         * This permits the qdisc->running owner to get the lock more
3199         * often and dequeue packets faster.
3200         */
3201        contended = qdisc_is_running(q);
3202        if (unlikely(contended))
3203                spin_lock(&q->busylock);
3204
3205        spin_lock(root_lock);
3206        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3207                __qdisc_drop(skb, &to_free);
3208                rc = NET_XMIT_DROP;
3209        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3210                   qdisc_run_begin(q)) {
3211                /*
3212                 * This is a work-conserving queue; there are no old skbs
3213                 * waiting to be sent out; and the qdisc is not running -
3214                 * xmit the skb directly.
3215                 */
3216
3217                qdisc_bstats_update(q, skb);
3218
3219                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3220                        if (unlikely(contended)) {
3221                                spin_unlock(&q->busylock);
3222                                contended = false;
3223                        }
3224                        __qdisc_run(q);
3225                } else
3226                        qdisc_run_end(q);
3227
3228                rc = NET_XMIT_SUCCESS;
3229        } else {
3230                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3231                if (qdisc_run_begin(q)) {
3232                        if (unlikely(contended)) {
3233                                spin_unlock(&q->busylock);
3234                                contended = false;
3235                        }
3236                        __qdisc_run(q);
3237                }
3238        }
3239        spin_unlock(root_lock);
3240        if (unlikely(to_free))
3241                kfree_skb_list(to_free);
3242        if (unlikely(contended))
3243                spin_unlock(&q->busylock);
3244        return rc;
3245}
3246
3247#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3248static void skb_update_prio(struct sk_buff *skb)
3249{
3250        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3251
3252        if (!skb->priority && skb->sk && map) {
3253                unsigned int prioidx =
3254                        sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3255
3256                if (prioidx < map->priomap_len)
3257                        skb->priority = map->priomap[prioidx];
3258        }
3259}
3260#else
3261#define skb_update_prio(skb)
3262#endif
3263
3264DEFINE_PER_CPU(int, xmit_recursion);
3265EXPORT_SYMBOL(xmit_recursion);
3266
3267/**
3268 *      dev_loopback_xmit - loop back @skb
3269 *      @net: network namespace this loopback is happening in
3270 *      @sk:  sk required by the netfilter okfn signature
3271 *      @skb: buffer to transmit
3272 */
3273int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3274{
3275        skb_reset_mac_header(skb);
3276        __skb_pull(skb, skb_network_offset(skb));
3277        skb->pkt_type = PACKET_LOOPBACK;
3278        skb->ip_summed = CHECKSUM_UNNECESSARY;
3279        WARN_ON(!skb_dst(skb));
3280        skb_dst_force(skb);
3281        netif_rx_ni(skb);
3282        return 0;
3283}
3284EXPORT_SYMBOL(dev_loopback_xmit);
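
/* Illustrative sketch (not from this file): dev_loopback_xmit() is meant to
 * be used as the okfn of a netfilter hook traversal, e.g. when a protocol
 * loops a copy of a multicast frame back to local listeners.  Roughly (hook
 * point and variables are hypothetical here):
 *
 *	NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, copy,
 *		NULL, copy->dev, dev_loopback_xmit);
 */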
3285
3286#ifdef CONFIG_NET_EGRESS
3287static struct sk_buff *
3288sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3289{
3290        struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3291        struct tcf_result cl_res;
3292
3293        if (!miniq)
3294                return skb;
3295
3296        /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3297        mini_qdisc_bstats_cpu_update(miniq, skb);
3298
3299        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3300        case TC_ACT_OK:
3301        case TC_ACT_RECLASSIFY:
3302                skb->tc_index = TC_H_MIN(cl_res.classid);
3303                break;
3304        case TC_ACT_SHOT:
3305                mini_qdisc_qstats_cpu_drop(miniq);
3306                *ret = NET_XMIT_DROP;
3307                kfree_skb(skb);
3308                return NULL;
3309        case TC_ACT_STOLEN:
3310        case TC_ACT_QUEUED:
3311        case TC_ACT_TRAP:
3312                *ret = NET_XMIT_SUCCESS;
3313                consume_skb(skb);
3314                return NULL;
3315        case TC_ACT_REDIRECT:
3316                /* No need to push/pop skb's mac_header here on egress! */
3317                skb_do_redirect(skb);
3318                *ret = NET_XMIT_SUCCESS;
3319                return NULL;
3320        default:
3321                break;
3322        }
3323
3324        return skb;
3325}
3326#endif /* CONFIG_NET_EGRESS */
3327
3328static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3329{
3330#ifdef CONFIG_XPS
3331        struct xps_dev_maps *dev_maps;
3332        struct xps_map *map;
3333        int queue_index = -1;
3334
3335        rcu_read_lock();
3336        dev_maps = rcu_dereference(dev->xps_maps);
3337        if (dev_maps) {
3338                unsigned int tci = skb->sender_cpu - 1;
3339
3340                if (dev->num_tc) {
3341                        tci *= dev->num_tc;
3342                        tci += netdev_get_prio_tc_map(dev, skb->priority);
3343                }
3344
3345                map = rcu_dereference(dev_maps->cpu_map[tci]);
3346                if (map) {
3347                        if (map->len == 1)
3348                                queue_index = map->queues[0];
3349                        else
3350                                queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3351                                                                           map->len)];
3352                        if (unlikely(queue_index >= dev->real_num_tx_queues))
3353                                queue_index = -1;
3354                }
3355        }
3356        rcu_read_unlock();
3357
3358        return queue_index;
3359#else
3360        return -1;
3361#endif
3362}
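
/* Worked example (illustrative): with skb->sender_cpu = 3 the base tci is 2.
 * If the device has num_tc = 4 and skb->priority maps to traffic class 1,
 * tci becomes 2 * 4 + 1 = 9 and the queue comes from dev_maps->cpu_map[9],
 * hashed across map->len entries whenever that map holds more than one
 * queue.
 */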
3363
3364static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3365{
3366        struct sock *sk = skb->sk;
3367        int queue_index = sk_tx_queue_get(sk);
3368
3369        if (queue_index < 0 || skb->ooo_okay ||
3370            queue_index >= dev->real_num_tx_queues) {
3371                int new_index = get_xps_queue(dev, skb);
3372
3373                if (new_index < 0)
3374                        new_index = skb_tx_hash(dev, skb);
3375
3376                if (queue_index != new_index && sk &&
3377                    sk_fullsock(sk) &&
3378                    rcu_access_pointer(sk->sk_dst_cache))
3379                        sk_tx_queue_set(sk, new_index);
3380
3381                queue_index = new_index;
3382        }
3383
3384        return queue_index;
3385}
3386
3387struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3388                                    struct sk_buff *skb,
3389                                    void *accel_priv)
3390{
3391        int queue_index = 0;
3392
3393#ifdef CONFIG_XPS
3394        u32 sender_cpu = skb->sender_cpu - 1;
3395
3396        if (sender_cpu >= (u32)NR_CPUS)
3397                skb->sender_cpu = raw_smp_processor_id() + 1;
3398#endif
3399
3400        if (dev->real_num_tx_queues != 1) {
3401                const struct net_device_ops *ops = dev->netdev_ops;
3402
3403                if (ops->ndo_select_queue)
3404                        queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3405                                                            __netdev_pick_tx);
3406                else
3407                        queue_index = __netdev_pick_tx(dev, skb);
3408
3409                if (!accel_priv)
3410                        queue_index = netdev_cap_txqueue(dev, queue_index);
3411        }
3412
3413        skb_set_queue_mapping(skb, queue_index);
3414        return netdev_get_tx_queue(dev, queue_index);
3415}
3416
3417/**
3418 *      __dev_queue_xmit - transmit a buffer
3419 *      @skb: buffer to transmit
3420 *      @accel_priv: private data used for L2 forwarding offload
3421 *
3422 *      Queue a buffer for transmission to a network device. The caller must
3423 *      have set the device and priority and built the buffer before calling
3424 *      this function. The function can be called from an interrupt.
3425 *
3426 *      A negative errno code is returned on a failure. A success does not
3427 *      guarantee the frame will be transmitted as it may be dropped due
3428 *      to congestion or traffic shaping.
3429 *
3430 * -----------------------------------------------------------------------------------
3431 *      I notice this method can also return errors from the queue disciplines,
3432 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3433 *      be positive.
3434 *
3435 *      Regardless of the return value, the skb is consumed, so it is currently
3436 *      difficult to retry a send to this method.  (You can bump the ref count
3437 *      before sending to hold a reference for retry if you are careful.)
3438 *
3439 *      When calling this method, interrupts MUST be enabled.  This is because
3440 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3441 *          --BLG
3442 */
3443static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3444{
3445        struct net_device *dev = skb->dev;
3446        struct netdev_queue *txq;
3447        struct Qdisc *q;
3448        int rc = -ENOMEM;
3449
3450        skb_reset_mac_header(skb);
3451
3452        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3453                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3454
3455        /* Disable soft irqs for the various locks below. This also
3456         * stops preemption for RCU.
3457         */
3458        rcu_read_lock_bh();
3459
3460        skb_update_prio(skb);
3461
3462        qdisc_pkt_len_init(skb);
3463#ifdef CONFIG_NET_CLS_ACT
3464        skb->tc_at_ingress = 0;
3465# ifdef CONFIG_NET_EGRESS
3466        if (static_key_false(&egress_needed)) {
3467                skb = sch_handle_egress(skb, &rc, dev);
3468                if (!skb)
3469                        goto out;
3470        }
3471# endif
3472#endif
3473        /* If the device/qdisc doesn't need skb->dst, release it right now
3474         * while it's hot in this CPU's cache.
3475         */
3476        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3477                skb_dst_drop(skb);
3478        else
3479                skb_dst_force(skb);
3480
3481        txq = netdev_pick_tx(dev, skb, accel_priv);
3482        q = rcu_dereference_bh(txq->qdisc);
3483
3484        trace_net_dev_queue(skb);
3485        if (q->enqueue) {
3486                rc = __dev_xmit_skb(skb, q, dev, txq);
3487                goto out;
3488        }
3489
3490        /* The device has no queue. Common case for software devices:
3491         * loopback, all sorts of tunnels...
3492         *
3493         * Really, it is unlikely that netif_tx_lock protection is necessary
3494         * here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
3495         * counters.)
3496         * However, it is possible that they rely on the protection
3497         * made by us here.
3498         *
3499         * Check this and take the lock; it is not prone to deadlocks.
3500         * Either that, or shoot the noqueue qdisc - it is even simpler 8)
3501         */
3502        if (dev->flags & IFF_UP) {
3503                int cpu = smp_processor_id(); /* ok because BHs are off */
3504
3505                if (txq->xmit_lock_owner != cpu) {
3506                        if (unlikely(__this_cpu_read(xmit_recursion) >
3507                                     XMIT_RECURSION_LIMIT))
3508                                goto recursion_alert;
3509
3510                        skb = validate_xmit_skb(skb, dev);
3511                        if (!skb)
3512                                goto out;
3513
3514                        HARD_TX_LOCK(dev, txq, cpu);
3515
3516                        if (!netif_xmit_stopped(txq)) {
3517                                __this_cpu_inc(xmit_recursion);
3518                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3519                                __this_cpu_dec(xmit_recursion);
3520                                if (dev_xmit_complete(rc)) {
3521                                        HARD_TX_UNLOCK(dev, txq);
3522                                        goto out;
3523                                }
3524                        }
3525                        HARD_TX_UNLOCK(dev, txq);
3526                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3527                                             dev->name);
3528                } else {
3529                        /* Recursion has been detected! It can happen,
3530                         * unfortunately.
3531                         */
3532recursion_alert:
3533                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3534                                             dev->name);
3535                }
3536        }
3537
3538        rc = -ENETDOWN;
3539        rcu_read_unlock_bh();
3540
3541        atomic_long_inc(&dev->tx_dropped);
3542        kfree_skb_list(skb);
3543        return rc;
3544out:
3545        rcu_read_unlock_bh();
3546        return rc;
3547}
3548
3549int dev_queue_xmit(struct sk_buff *skb)
3550{
3551        return __dev_queue_xmit(skb, NULL);
3552}
3553EXPORT_SYMBOL(dev_queue_xmit);
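
/* Illustrative sketch (not from this file): a minimal caller that hands a
 * fully built frame to dev_queue_xmit().  The helper below is hypothetical;
 * note the skb is consumed whatever the return value is.
 *
 *	static int example_send_frame(struct net_device *dev,
 *				      struct sk_buff *skb)
 *	{
 *		skb->dev = dev;
 *		return dev_queue_xmit(skb);
 *	}
 */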
3554
3555int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3556{
3557        return __dev_queue_xmit(skb, accel_priv);
3558}
3559EXPORT_SYMBOL(dev_queue_xmit_accel);
3560
3561
3562/*************************************************************************
3563 *                      Receiver routines
3564 *************************************************************************/
3565
3566int netdev_max_backlog __read_mostly = 1000;
3567EXPORT_SYMBOL(netdev_max_backlog);
3568
3569int netdev_tstamp_prequeue __read_mostly = 1;
3570int netdev_budget __read_mostly = 300;
3571unsigned int __read_mostly netdev_budget_usecs = 2000;
3572int weight_p __read_mostly = 64;           /* old backlog weight */
3573int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
3574int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
3575int dev_rx_weight __read_mostly = 64;
3576int dev_tx_weight __read_mostly = 64;
3577
3578/* Called with irq disabled */
3579static inline void ____napi_schedule(struct softnet_data *sd,
3580                                     struct napi_struct *napi)
3581{
3582        list_add_tail(&napi->poll_list, &sd->poll_list);
3583        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3584}
3585
3586#ifdef CONFIG_RPS
3587
3588/* One global table that all flow-based protocols share. */
3589struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3590EXPORT_SYMBOL(rps_sock_flow_table);
3591u32 rps_cpu_mask __read_mostly;
3592EXPORT_SYMBOL(rps_cpu_mask);
3593
3594struct static_key rps_needed __read_mostly;
3595EXPORT_SYMBOL(rps_needed);
3596struct static_key rfs_needed __read_mostly;
3597EXPORT_SYMBOL(rfs_needed);
3598
3599static struct rps_dev_flow *
3600set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3601            struct rps_dev_flow *rflow, u16 next_cpu)
3602{
3603        if (next_cpu < nr_cpu_ids) {
3604#ifdef CONFIG_RFS_ACCEL
3605                struct netdev_rx_queue *rxqueue;
3606                struct rps_dev_flow_table *flow_table;
3607                struct rps_dev_flow *old_rflow;
3608                u32 flow_id;
3609                u16 rxq_index;
3610                int rc;
3611
3612                /* Should we steer this flow to a different hardware queue? */
3613                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3614                    !(dev->features & NETIF_F_NTUPLE))
3615                        goto out;
3616                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3617                if (rxq_index == skb_get_rx_queue(skb))
3618                        goto out;
3619
3620                rxqueue = dev->_rx + rxq_index;
3621                flow_table = rcu_dereference(rxqueue->rps_flow_table);
3622                if (!flow_table)
3623                        goto out;
3624                flow_id = skb_get_hash(skb) & flow_table->mask;
3625                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3626                                                        rxq_index, flow_id);
3627                if (rc < 0)
3628                        goto out;
3629                old_rflow = rflow;
3630                rflow = &flow_table->flows[flow_id];
3631                rflow->filter = rc;
3632                if (old_rflow->filter == rflow->filter)
3633                        old_rflow->filter = RPS_NO_FILTER;
3634        out:
3635#endif
3636                rflow->last_qtail =
3637                        per_cpu(softnet_data, next_cpu).input_queue_head;
3638        }
3639
3640        rflow->cpu = next_cpu;
3641        return rflow;
3642}
3643
3644/*
3645 * get_rps_cpu is called from netif_receive_skb and returns the target
3646 * CPU from the RPS map of the receiving queue for a given skb.
3647 * rcu_read_lock must be held on entry.
3648 */
3649static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3650                       struct rps_dev_flow **rflowp)
3651{
3652        const struct rps_sock_flow_table *sock_flow_table;
3653        struct netdev_rx_queue *rxqueue = dev->_rx;
3654        struct rps_dev_flow_table *flow_table;
3655        struct rps_map *map;
3656        int cpu = -1;
3657        u32 tcpu;
3658        u32 hash;
3659
3660        if (skb_rx_queue_recorded(skb)) {
3661                u16 index = skb_get_rx_queue(skb);
3662
3663                if (unlikely(index >= dev->real_num_rx_queues)) {
3664                        WARN_ONCE(dev->real_num_rx_queues > 1,
3665                                  "%s received packet on queue %u, but number "
3666                                  "of RX queues is %u\n",
3667                                  dev->name, index, dev->real_num_rx_queues);
3668                        goto done;
3669                }
3670                rxqueue += index;
3671        }
3672
3673        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3674
3675        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3676        map = rcu_dereference(rxqueue->rps_map);
3677        if (!flow_table && !map)
3678                goto done;
3679
3680        skb_reset_network_header(skb);
3681        hash = skb_get_hash(skb);
3682        if (!hash)
3683                goto done;
3684
3685        sock_flow_table = rcu_dereference(rps_sock_flow_table);
3686        if (flow_table && sock_flow_table) {
3687                struct rps_dev_flow *rflow;
3688                u32 next_cpu;
3689                u32 ident;
3690
3691                /* First check the global flow table for a match */
3692                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3693                if ((ident ^ hash) & ~rps_cpu_mask)
3694                        goto try_rps;
3695
3696                next_cpu = ident & rps_cpu_mask;
3697
3698                /* OK, now we know there is a match;
3699                 * we can look at the local (per receive queue) flow table.
3700                 */
3701                rflow = &flow_table->flows[hash & flow_table->mask];
3702                tcpu = rflow->cpu;
3703
3704                /*
3705                 * If the desired CPU (where last recvmsg was done) is
3706                 * different from current CPU (one in the rx-queue flow
3707                 * table entry), switch if one of the following holds:
3708                 *   - Current CPU is unset (>= nr_cpu_ids).
3709                 *   - Current CPU is offline.
3710                 *   - The current CPU's queue tail has advanced beyond the
3711                 *     last packet that was enqueued using this table entry.
3712                 *     This guarantees that all previous packets for the flow
3713                 *     have been dequeued, thus preserving in order delivery.
3714                 */
3715                if (unlikely(tcpu != next_cpu) &&
3716                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3717                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3718                      rflow->last_qtail)) >= 0)) {
3719                        tcpu = next_cpu;
3720                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3721                }
3722
3723                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3724                        *rflowp = rflow;
3725                        cpu = tcpu;
3726                        goto done;
3727                }
3728        }
3729
3730try_rps:
3731
3732        if (map) {
3733                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3734                if (cpu_online(tcpu)) {
3735                        cpu = tcpu;
3736                        goto done;
3737                }
3738        }
3739
3740done:
3741        return cpu;
3742}
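
/* Worked example (illustrative): on a system with 64 possible CPUs,
 * rps_cpu_mask is 0x3f, so the low 6 bits of a sock flow table entry hold
 * the desired CPU and the remaining bits hold the upper bits of the flow
 * hash.  For hash = 0x9ac3175e and ident = 0x9ac31745,
 * (ident ^ hash) & ~0x3f == 0, so the entry matches this flow and
 * next_cpu = 0x45 & 0x3f = 5.
 */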
3743
3744#ifdef CONFIG_RFS_ACCEL
3745
3746/**
3747 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3748 * @dev: Device on which the filter was set
3749 * @rxq_index: RX queue index
3750 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3751 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3752 *
3753 * Drivers that implement ndo_rx_flow_steer() should periodically call
3754 * this function for each installed filter and remove the filters for
3755 * which it returns %true.
3756 */
3757bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3758                         u32 flow_id, u16 filter_id)
3759{
3760        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3761        struct rps_dev_flow_table *flow_table;
3762        struct rps_dev_flow *rflow;
3763        bool expire = true;
3764        unsigned int cpu;
3765
3766        rcu_read_lock();
3767        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3768        if (flow_table && flow_id <= flow_table->mask) {
3769                rflow = &flow_table->flows[flow_id];
3770                cpu = READ_ONCE(rflow->cpu);
3771                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3772                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3773                           rflow->last_qtail) <
3774                     (int)(10 * flow_table->mask)))
3775                        expire = false;
3776        }
3777        rcu_read_unlock();
3778        return expire;
3779}
3780EXPORT_SYMBOL(rps_may_expire_flow);
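
/* Illustrative sketch (not from this file): a driver implementing
 * ndo_rx_flow_steer() might age out its hardware filters from a periodic
 * work item roughly like this (the filter table and helpers below are
 * hypothetical):
 *
 *	for (i = 0; i < priv->n_filters; i++) {
 *		struct example_filter *f = &priv->filters[i];
 *
 *		if (f->in_use &&
 *		    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, f->filter_id)) {
 *			example_remove_hw_filter(priv, f);
 *			f->in_use = false;
 *		}
 *	}
 */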
3781
3782#endif /* CONFIG_RFS_ACCEL */
3783
3784/* Called from hardirq (IPI) context */
3785static void rps_trigger_softirq(void *data)
3786{
3787        struct softnet_data *sd = data;
3788
3789        ____napi_schedule(sd, &sd->backlog);
3790        sd->received_rps++;
3791}
3792
3793#endif /* CONFIG_RPS */
3794
3795/*
3796 * Check if this softnet_data structure belongs to another CPU.
3797 * If so, queue it to our IPI list and return 1.
3798 * If not, return 0.
3799 */
3800static int rps_ipi_queued(struct softnet_data *sd)
3801{
3802#ifdef CONFIG_RPS
3803        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3804
3805        if (sd != mysd) {
3806                sd->rps_ipi_next = mysd->rps_ipi_list;
3807                mysd->rps_ipi_list = sd;
3808
3809                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3810                return 1;
3811        }
3812#endif /* CONFIG_RPS */
3813        return 0;
3814}
3815
3816#ifdef CONFIG_NET_FLOW_LIMIT
3817int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3818#endif
3819
3820static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3821{
3822#ifdef CONFIG_NET_FLOW_LIMIT
3823        struct sd_flow_limit *fl;
3824        struct softnet_data *sd;
3825        unsigned int old_flow, new_flow;
3826
3827        if (qlen < (netdev_max_backlog >> 1))
3828                return false;
3829
3830        sd = this_cpu_ptr(&softnet_data);
3831
3832        rcu_read_lock();
3833        fl = rcu_dereference(sd->flow_limit);
3834        if (fl) {
3835                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3836                old_flow = fl->history[fl->history_head];
3837                fl->history[fl->history_head] = new_flow;
3838
3839                fl->history_head++;
3840                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3841
3842                if (likely(fl->buckets[old_flow]))
3843                        fl->buckets[old_flow]--;
3844
3845                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3846                        fl->count++;
3847                        rcu_read_unlock();
3848                        return true;
3849                }
3850        }
3851        rcu_read_unlock();
3852#endif
3853        return false;
3854}
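
/* Worked example (illustrative): with the default netdev_max_backlog of
 * 1000, the flow limit only engages once this CPU's backlog already holds
 * at least 500 packets.  A packet is then dropped when its flow's bucket
 * climbs above FLOW_LIMIT_HISTORY / 2, i.e. when that one flow accounts for
 * more than half of the most recently enqueued packets tracked in
 * fl->history.
 */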
3855
3856/*
3857 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
3858 * queue (which may belong to a remote CPU).
3859 */
3860static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3861                              unsigned int *qtail)
3862{
3863        struct softnet_data *sd;
3864        unsigned long flags;
3865        unsigned int qlen;
3866
3867        sd = &per_cpu(softnet_data, cpu);
3868
3869        local_irq_save(flags);
3870
3871        rps_lock(sd);
3872        if (!netif_running(skb->dev))
3873                goto drop;
3874        qlen = skb_queue_len(&sd->input_pkt_queue);
3875        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3876                if (qlen) {
3877enqueue:
3878                        __skb_queue_tail(&sd->input_pkt_queue, skb);
3879                        input_queue_tail_incr_save(sd, qtail);
3880                        rps_unlock(sd);
3881                        local_irq_restore(flags);
3882                        return NET_RX_SUCCESS;
3883                }
3884
3885                /* Schedule NAPI for the backlog device.
3886                 * We can use a non-atomic operation since we own the queue lock.
3887                 */
3888                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3889                        if (!rps_ipi_queued(sd))
3890                                ____napi_schedule(sd, &sd->backlog);
3891                }
3892                goto enqueue;
3893        }
3894
3895drop:
3896        sd->dropped++;
3897        rps_unlock(sd);
3898
3899        local_irq_restore(flags);
3900
3901        atomic_long_inc(&skb->dev->rx_dropped);
3902        kfree_skb(skb);
3903        return NET_RX_DROP;
3904}
3905
3906static u32 netif_receive_generic_xdp(struct sk_buff *skb,
3907                                     struct bpf_prog *xdp_prog)
3908{
3909        u32 metalen, act = XDP_DROP;
3910        struct xdp_buff xdp;
3911        void *orig_data;
3912        int hlen, off;
3913        u32 mac_len;
3914
3915        /* Reinjected packets coming from act_mirred or similar should
3916         * not get XDP generic processing.
3917         */
3918        if (skb_cloned(skb))
3919                return XDP_PASS;
3920
3921        /* XDP packets must be linear and must have sufficient headroom
3922         * of XDP_PACKET_HEADROOM bytes. This is the guarantee that native
3923         * XDP also provides, so we need to enforce it here as well.
3924         */
3925        if (skb_is_nonlinear(skb) ||
3926            skb_headroom(skb) < XDP_PACKET_HEADROOM) {
3927                int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
3928                int troom = skb->tail + skb->data_len - skb->end;
3929
3930                /* In case we have to go down this path and also linearize,
3931                 * let's do the pskb_expand_head() work just once here.
3932                 */
3933                if (pskb_expand_head(skb,
3934                                     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
3935                                     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
3936                        goto do_drop;
3937                if (skb_linearize(skb))
3938                        goto do_drop;
3939        }
3940
3941        /* The XDP program wants to see the packet starting at the MAC
3942         * header.
3943         */
3944        mac_len = skb->data - skb_mac_header(skb);
3945        hlen = skb_headlen(skb) + mac_len;
3946        xdp.data = skb->data - mac_len;
3947        xdp.data_meta = xdp.data;
3948        xdp.data_end = xdp.data + hlen;
3949        xdp.data_hard_start = skb->data - skb_headroom(skb);
3950        orig_data = xdp.data;
3951
3952        act = bpf_prog_run_xdp(xdp_prog, &xdp);
3953
3954        off = xdp.data - orig_data;
3955        if (off > 0)
3956                __skb_pull(skb, off);
3957        else if (off < 0)
3958                __skb_push(skb, -off);
3959        skb->mac_header += off;
3960
3961        switch (act) {
3962        case XDP_REDIRECT:
3963        case XDP_TX:
3964                __skb_push(skb, mac_len);
3965                break;
3966        case XDP_PASS:
3967                metalen = xdp.data - xdp.data_meta;
3968                if (metalen)
3969                        skb_metadata_set(skb, metalen);
3970                break;
3971        default:
3972                bpf_warn_invalid_xdp_action(act);
3973                /* fall through */
3974        case XDP_ABORTED:
3975                trace_xdp_exception(skb->dev, xdp_prog, act);
3976                /* fall through */
3977        case XDP_DROP:
3978        do_drop:
3979                kfree_skb(skb);
3980                break;
3981        }
3982
3983        return act;
3984}
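
/* Worked example (illustrative): if the XDP program grew the packet at the
 * front, e.g. via bpf_xdp_adjust_head(ctx, -8) to make room for an
 * encapsulation header, xdp.data moved back by 8 bytes, so off = -8 and the
 * code above performs __skb_push(skb, 8) and moves skb->mac_header back by
 * 8 bytes, keeping the skb geometry in sync with what the program saw.
 */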
3985
3986/* When doing generic XDP we have to bypass the qdisc layer and the
3987 * network taps in order to match in-driver-XDP behavior.
3988 */
3989void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
3990{
3991        struct net_device *dev = skb->dev;
3992        struct netdev_queue *txq;
3993        bool free_skb = true;
3994        int cpu, rc;
3995
3996        txq = netdev_pick_tx(dev, skb, NULL);
3997        cpu = smp_processor_id();
3998        HARD_TX_LOCK(dev, txq, cpu);
3999        if (!netif_xmit_stopped(txq)) {
4000                rc = netdev_start_xmit(skb, dev, txq, 0);
4001                if (dev_xmit_complete(rc))
4002                        free_skb = false;
4003        }
4004        HARD_TX_UNLOCK(dev, txq);
4005        if (free_skb) {
4006                trace_xdp_exception(dev, xdp_prog, XDP_TX);
4007                kfree_skb(skb);
4008        }
4009}
4010EXPORT_SYMBOL_GPL(generic_xdp_tx);
4011
4012static struct static_key generic_xdp_needed __read_mostly;
4013
4014int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4015{
4016        if (xdp_prog) {
4017                u32 act = netif_receive_generic_xdp(skb, xdp_prog);
4018                int err;
4019
4020                if (act != XDP_PASS) {
4021                        switch (act) {
4022                        case XDP_REDIRECT:
4023                                err = xdp_do_generic_redirect(skb->dev, skb,
4024                                                              xdp_prog);
4025                                if (err)
4026                                        goto out_redir;
4027                        /* fallthru to submit skb */
4028                        case XDP_TX:
4029                                generic_xdp_tx(skb, xdp_prog);
4030                                break;
4031                        }
4032                        return XDP_DROP;
4033                }
4034        }
4035        return XDP_PASS;
4036out_redir:
4037        kfree_skb(skb);
4038        return XDP_DROP;
4039}
4040EXPORT_SYMBOL_GPL(do_xdp_generic);
4041
4042static int netif_rx_internal(struct sk_buff *skb)
4043{
4044        int ret;
4045
4046        net_timestamp_check(netdev_tstamp_prequeue, skb);
4047
4048        trace_netif_rx(skb);
4049
4050        if (static_key_false(&generic_xdp_needed)) {
4051                int ret;
4052
4053                preempt_disable();
4054                rcu_read_lock();
4055                ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4056                rcu_read_unlock();
4057                preempt_enable();
4058
4059                /* Consider XDP consuming the packet a success from
4060                 * the netdev point of view: we do not want to count
4061                 * this as an error.
4062                 */
4063                if (ret != XDP_PASS)
4064                        return NET_RX_SUCCESS;
4065        }
4066
4067#ifdef CONFIG_RPS
4068        if (static_key_false(&rps_needed)) {
4069                struct rps_dev_flow voidflow, *rflow = &voidflow;
4070                int cpu;
4071
4072                preempt_disable();
4073                rcu_read_lock();
4074
4075                cpu = get_rps_cpu(skb->dev, skb, &rflow);
4076                if (cpu < 0)
4077                        cpu = smp_processor_id();
4078
4079                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4080
4081                rcu_read_unlock();
4082                preempt_enable();
4083        } else
4084#endif
4085        {
4086                unsigned int qtail;
4087
4088                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4089                put_cpu();
4090        }
4091        return ret;
4092}
4093
4094/**
4095 *      netif_rx        -       post buffer to the network code
4096 *      @skb: buffer to post
4097 *
4098 *      This function receives a packet from a device driver and queues it for
4099 *      the upper (protocol) levels to process.  It always succeeds. The buffer
4100 *      may be dropped during processing for congestion control or by the
4101 *      protocol layers.
4102 *
4103 *      return values:
4104 *      NET_RX_SUCCESS  (no congestion)
4105 *      NET_RX_DROP     (packet was dropped)
4106 *
4107 */
4108
4109int netif_rx(struct sk_buff *skb)
4110{
4111        trace_netif_rx_entry(skb);
4112
4113        return netif_rx_internal(skb);
4114}
4115EXPORT_SYMBOL(netif_rx);
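
/* Illustrative sketch (not from this file): the classic way a simple
 * driver's receive path feeds a frame to netif_rx() from its interrupt
 * handler (device, buffer and length below are hypothetical):
 *
 *	skb = netdev_alloc_skb(dev, pkt_len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	skb_put_data(skb, rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */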
4116
4117int netif_rx_ni(struct sk_buff *skb)
4118{
4119        int err;
4120
4121        trace_netif_rx_ni_entry(skb);
4122
4123        preempt_disable();
4124        err = netif_rx_internal(skb);
4125        if (local_softirq_pending())
4126                do_softirq();
4127        preempt_enable();
4128
4129        return err;
4130}
4131EXPORT_SYMBOL(netif_rx_ni);
4132
4133static __latent_entropy void net_tx_action(struct softirq_action *h)
4134{
4135        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4136
4137        if (sd->completion_queue) {
4138                struct sk_buff *clist;
4139
4140                local_irq_disable();
4141                clist = sd->completion_queue;
4142                sd->completion_queue = NULL;
4143                local_irq_enable();
4144
4145                while (clist) {
4146                        struct sk_buff *skb = clist;
4147
4148                        clist = clist->next;
4149
4150                        WARN_ON(refcount_read(&skb->users));
4151                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4152                                trace_consume_skb(skb);
4153                        else
4154                                trace_kfree_skb(skb, net_tx_action);
4155
4156                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4157                                __kfree_skb(skb);
4158                        else
4159                                __kfree_skb_defer(skb);
4160                }
4161
4162                __kfree_skb_flush();
4163        }
4164
4165        if (sd->output_queue) {
4166                struct Qdisc *head;
4167
4168                local_irq_disable();
4169                head = sd->output_queue;
4170                sd->output_queue = NULL;
4171                sd->output_queue_tailp = &sd->output_queue;
4172                local_irq_enable();
4173
4174                while (head) {
4175                        struct Qdisc *q = head;
4176                        spinlock_t *root_lock;
4177
4178                        head = head->next_sched;
4179
4180                        root_lock = qdisc_lock(q);
4181                        spin_lock(root_lock);
4182                        /* We need to make sure head->next_sched is read
4183                         * before clearing __QDISC_STATE_SCHED
4184                         */
4185                        smp_mb__before_atomic();
4186                        clear_bit(__QDISC_STATE_SCHED, &q->state);
4187                        qdisc_run(q);
4188                        spin_unlock(root_lock);
4189                }
4190        }
4191}
4192
4193#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4194/* This hook is defined here for ATM LANE */
4195int (*br_fdb_test_addr_hook)(struct net_device *dev,
4196                             unsigned char *addr) __read_mostly;
4197EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4198#endif
4199
4200static inline struct sk_buff *
4201sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4202                   struct net_device *orig_dev)
4203{
4204#ifdef CONFIG_NET_CLS_ACT
4205        struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
4206        struct tcf_result cl_res;
4207
4208        /* If there's at least one ingress qdisc present somewhere (so
4209         * we get here via enabled static key), remaining devices
4210         * that are not configured with an ingress qdisc will bail
4211         * out here.
4212         */
4213        if (!miniq)
4214                return skb;
4215
4216        if (*pt_prev) {
4217                *ret = deliver_skb(skb, *pt_prev, orig_dev);
4218                *pt_prev = NULL;
4219        }
4220
4221        qdisc_skb_cb(skb)->pkt_len = skb->len;
4222        skb->tc_at_ingress = 1;
4223        mini_qdisc_bstats_cpu_update(miniq, skb);
4224
4225        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
4226        case TC_ACT_OK:
4227        case TC_ACT_RECLASSIFY:
4228                skb->tc_index = TC_H_MIN(cl_res.classid);
4229                break;
4230        case TC_ACT_SHOT:
4231                mini_qdisc_qstats_cpu_drop(miniq);
4232                kfree_skb(skb);
4233                return NULL;
4234        case TC_ACT_STOLEN:
4235        case TC_ACT_QUEUED:
4236        case TC_ACT_TRAP:
4237                consume_skb(skb);
4238                return NULL;
4239        case TC_ACT_REDIRECT:
4240                /* skb_mac_header check was done by cls/act_bpf, so
4241                 * we can safely push the L2 header back before
4242                 * redirecting to another netdev
4243                 */
4244                __skb_push(skb, skb->mac_len);
4245                skb_do_redirect(skb);
4246                return NULL;
4247        default:
4248                break;
4249        }
4250#endif /* CONFIG_NET_CLS_ACT */
4251        return skb;
4252}
4253
4254/**
4255 *      netdev_is_rx_handler_busy - check if receive handler is registered
4256 *      @dev: device to check
4257 *
4258 *      Check if a receive handler is already registered for a given device.
4259 *      Return true if there is one.
4260 *
4261 *      The caller must hold the rtnl_mutex.
4262 */
4263bool netdev_is_rx_handler_busy(struct net_device *dev)
4264{
4265        ASSERT_RTNL();
4266        return dev && rtnl_dereference(dev->rx_handler);
4267}
4268EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4269
4270/**
4271 *      netdev_rx_handler_register - register receive handler
4272 *      @dev: device to register a handler for
4273 *      @rx_handler: receive handler to register
4274 *      @rx_handler_data: data pointer that is used by rx handler
4275 *
4276 *      Register a receive handler for a device. This handler will then be
4277 *      called from __netif_receive_skb. A negative errno code is returned
4278 *      on a failure.
4279 *
4280 *      The caller must hold the rtnl_mutex.
4281 *
4282 *      For a general description of rx_handler, see enum rx_handler_result.
4283 */
4284int netdev_rx_handler_register(struct net_device *dev,
4285                               rx_handler_func_t *rx_handler,
4286                               void *rx_handler_data)
4287{
4288        if (netdev_is_rx_handler_busy(dev))
4289                return -EBUSY;
4290
4291        /* Note: rx_handler_data must be set before rx_handler */
4292        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4293        rcu_assign_pointer(dev->rx_handler, rx_handler);
4294
4295        return 0;
4296}
4297EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
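
/* Illustrative sketch (not from this file): how an upper device, say a
 * hypothetical aggregation driver, could claim a lower device's receive
 * path.  The handler, port struct and device pointers are made up.
 *
 *	static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct example_port *port =
 *			rcu_dereference((*pskb)->dev->rx_handler_data);
 *
 *		(*pskb)->dev = port->upper_dev;
 *		return RX_HANDLER_ANOTHER;
 *	}
 *
 *	ASSERT_RTNL();
 *	err = netdev_rx_handler_register(lower_dev, example_handle_frame,
 *					 port);
 */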
4298
4299/**
4300 *      netdev_rx_handler_unregister - unregister receive handler
4301 *      @dev: device to unregister a handler from
4302 *
4303 *      Unregister a receive handler from a device.
4304 *
4305 *      The caller must hold the rtnl_mutex.
4306 */
4307void netdev_rx_handler_unregister(struct net_device *dev)
4308{
4309
4310        ASSERT_RTNL();
4311        RCU_INIT_POINTER(dev->rx_handler, NULL);
4312        /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
4313         * section is guaranteed to see a non-NULL rx_handler_data
4314         * as well.
4315         */
4316        synchronize_net();
4317        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4318}
4319EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4320
4321/*
4322 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4323 * the special handling of PFMEMALLOC skbs.
4324 */
4325static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4326{
4327        switch (skb->protocol) {
4328        case htons(ETH_P_ARP):
4329        case htons(ETH_P_IP):
4330        case htons(ETH_P_IPV6):
4331        case htons(ETH_P_8021Q):
4332        case htons(ETH_P_8021AD):
4333                return true;
4334        default:
4335                return false;
4336        }
4337}
4338
4339static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4340                             int *ret, struct net_device *orig_dev)
4341{
4342#ifdef CONFIG_NETFILTER_INGRESS
4343        if (nf_hook_ingress_active(skb)) {
4344                int ingress_retval;
4345
4346                if (*pt_prev) {
4347                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
4348                        *pt_prev = NULL;
4349                }
4350
4351                rcu_read_lock();
4352                ingress_retval = nf_hook_ingress(skb);
4353                rcu_read_unlock();
4354                return ingress_retval;
4355        }
4356#endif /* CONFIG_NETFILTER_INGRESS */
4357        return 0;
4358}
4359
4360static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4361{
4362        struct packet_type *ptype, *pt_prev;
4363        rx_handler_func_t *rx_handler;
4364        struct net_device *orig_dev;
4365        bool deliver_exact = false;
4366        int ret = NET_RX_DROP;
4367        __be16 type;
4368
4369        net_timestamp_check(!netdev_tstamp_prequeue, skb);
4370
4371        trace_netif_receive_skb(skb);
4372
4373        orig_dev = skb->dev;
4374
4375        skb_reset_network_header(skb);
4376        if (!skb_transport_header_was_set(skb))
4377                skb_reset_transport_header(skb);
4378        skb_reset_mac_len(skb);
4379
4380        pt_prev = NULL;
4381
4382another_round:
4383        skb->skb_iif = skb->dev->ifindex;
4384
4385        __this_cpu_inc(softnet_data.processed);
4386
4387        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4388            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4389                skb = skb_vlan_untag(skb);
4390                if (unlikely(!skb))
4391                        goto out;
4392        }
4393
4394        if (skb_skip_tc_classify(skb))
4395                goto skip_classify;
4396
4397        if (pfmemalloc)
4398                goto skip_taps;
4399
4400        list_for_each_entry_rcu(ptype, &ptype_all, list) {
4401                if (pt_prev)
4402                        ret = deliver_skb(skb, pt_prev, orig_dev);
4403                pt_prev = ptype;
4404        }
4405
4406        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4407                if (pt_prev)
4408                        ret = deliver_skb(skb, pt_prev, orig_dev);
4409                pt_prev = ptype;
4410        }
4411
4412skip_taps:
4413#ifdef CONFIG_NET_INGRESS
4414        if (static_key_false(&ingress_needed)) {
4415                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4416                if (!skb)
4417                        goto out;
4418
4419                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4420                        goto out;
4421        }
4422#endif
4423        skb_reset_tc(skb);
4424skip_classify:
4425        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4426                goto drop;
4427
4428        if (skb_vlan_tag_present(skb)) {
4429                if (pt_prev) {
4430                        ret = deliver_skb(skb, pt_prev, orig_dev);
4431                        pt_prev = NULL;
4432                }
4433                if (vlan_do_receive(&skb))
4434                        goto another_round;
4435                else if (unlikely(!skb))
4436                        goto out;
4437        }
4438
4439        rx_handler = rcu_dereference(skb->dev->rx_handler);
4440        if (rx_handler) {
4441                if (pt_prev) {
4442                        ret = deliver_skb(skb, pt_prev, orig_dev);
4443                        pt_prev = NULL;
4444                }
4445                switch (rx_handler(&skb)) {
4446                case RX_HANDLER_CONSUMED:
4447                        ret = NET_RX_SUCCESS;
4448                        goto out;
4449                case RX_HANDLER_ANOTHER:
4450                        goto another_round;
4451                case RX_HANDLER_EXACT:
4452                        deliver_exact = true;
4453                case RX_HANDLER_PASS:
4454                        break;
4455                default:
4456                        BUG();
4457                }
4458        }
4459
4460        if (unlikely(skb_vlan_tag_present(skb))) {
4461                if (skb_vlan_tag_get_id(skb))
4462                        skb->pkt_type = PACKET_OTHERHOST;
4463                /* Note: we might in the future use prio bits
4464                 * and set skb->priority like in vlan_do_receive().
4465                 * For the time being, just ignore the Priority Code Point.
4466                 */
4467                skb->vlan_tci = 0;
4468        }
4469
4470        type = skb->protocol;
4471
4472        /* deliver only exact match when indicated */
4473        if (likely(!deliver_exact)) {
4474                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4475                                       &ptype_base[ntohs(type) &
4476                                                   PTYPE_HASH_MASK]);
4477        }
4478
4479        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4480                               &orig_dev->ptype_specific);
4481
4482        if (unlikely(skb->dev != orig_dev)) {
4483                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4484                                       &skb->dev->ptype_specific);
4485        }
4486
4487        if (pt_prev) {
4488                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
4489                        goto drop;
4490                else
4491                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4492        } else {
4493drop:
4494                if (!deliver_exact)
4495                        atomic_long_inc(&skb->dev->rx_dropped);
4496                else
4497                        atomic_long_inc(&skb->dev->rx_nohandler);
4498                kfree_skb(skb);
4499                /* Jamal, now you will not be able to escape explaining
4500                 * to me how you were going to use this. :-)
4501                 */
4502                ret = NET_RX_DROP;
4503        }
4504
4505out:
4506        return ret;
4507}
4508
4509/**
4510 *      netif_receive_skb_core - special purpose version of netif_receive_skb
4511 *      @skb: buffer to process
4512 *
4513 *      More direct receive version of netif_receive_skb().  It should
4514 *      only be used by callers that have a need to skip RPS and Generic XDP.
4515 *      Caller must also take care of handling if (page_is_)pfmemalloc.
4516 *
4517 *      This function may only be called from softirq context and interrupts
4518 *      should be enabled.
4519 *
4520 *      Return values (usually ignored):
4521 *      NET_RX_SUCCESS: no congestion
4522 *      NET_RX_DROP: packet was dropped
4523 */
4524int netif_receive_skb_core(struct sk_buff *skb)
4525{
4526        int ret;
4527
4528        rcu_read_lock();
4529        ret = __netif_receive_skb_core(skb, false);
4530        rcu_read_unlock();
4531
4532        return ret;
4533}
4534EXPORT_SYMBOL(netif_receive_skb_core);
4535
4536static int __netif_receive_skb(struct sk_buff *skb)
4537{
4538        int ret;
4539
4540        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4541                unsigned int noreclaim_flag;
4542
4543                /*
4544                 * PFMEMALLOC skbs are special, they should
4545                 * - be delivered to SOCK_MEMALLOC sockets only
4546                 * - stay away from userspace
4547                 * - have bounded memory usage
4548                 *
4549                 * Use PF_MEMALLOC as this saves us from propagating the allocation
4550                 * context down to all allocation sites.
4551                 */
4552                noreclaim_flag = memalloc_noreclaim_save();
4553                ret = __netif_receive_skb_core(skb, true);
4554                memalloc_noreclaim_restore(noreclaim_flag);
4555        } else
4556                ret = __netif_receive_skb_core(skb, false);
4557
4558        return ret;
4559}
4560
4561static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
4562{
4563        struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
4564        struct bpf_prog *new = xdp->prog;
4565        int ret = 0;
4566
4567        switch (xdp->command) {
4568        case XDP_SETUP_PROG:
4569                rcu_assign_pointer(dev->xdp_prog, new);
4570                if (old)
4571                        bpf_prog_put(old);
4572
4573                if (old && !new) {
4574                        static_key_slow_dec(&generic_xdp_needed);
4575                } else if (new && !old) {
4576                        static_key_slow_inc(&generic_xdp_needed);
4577                        dev_disable_lro(dev);
4578                }
4579                break;
4580
4581        case XDP_QUERY_PROG:
4582                xdp->prog_attached = !!old;
4583                xdp->prog_id = old ? old->aux->id : 0;
4584                break;
4585
4586        default:
4587                ret = -EINVAL;
4588                break;
4589        }
4590
4591        return ret;
4592}
4593
4594static int netif_receive_skb_internal(struct sk_buff *skb)
4595{
4596        int ret;
4597
4598        net_timestamp_check(netdev_tstamp_prequeue, skb);
4599
4600        if (skb_defer_rx_timestamp(skb))
4601                return NET_RX_SUCCESS;
4602
4603        if (static_key_false(&generic_xdp_needed)) {
4604                int ret;
4605
4606                preempt_disable();
4607                rcu_read_lock();
4608                ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4609                rcu_read_unlock();
4610                preempt_enable();
4611
4612                if (ret != XDP_PASS)
4613                        return NET_RX_DROP;
4614        }
4615
4616        rcu_read_lock();
4617#ifdef CONFIG_RPS
4618        if (static_key_false(&rps_needed)) {
4619                struct rps_dev_flow voidflow, *rflow = &voidflow;
4620                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4621
4622                if (cpu >= 0) {
4623                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4624                        rcu_read_unlock();
4625                        return ret;
4626                }
4627        }
4628#endif
4629        ret = __netif_receive_skb(skb);
4630        rcu_read_unlock();
4631        return ret;
4632}
4633
4634/**
4635 *      netif_receive_skb - process receive buffer from network
4636 *      @skb: buffer to process
4637 *
4638 *      netif_receive_skb() is the main receive data processing function.
4639 *      It always succeeds. The buffer may be dropped during processing
4640 *      for congestion control or by the protocol layers.
4641 *
4642 *      This function may only be called from softirq context and interrupts
4643 *      should be enabled.
4644 *
4645 *      Return values (usually ignored):
4646 *      NET_RX_SUCCESS: no congestion
4647 *      NET_RX_DROP: packet was dropped
4648 */
4649int netif_receive_skb(struct sk_buff *skb)
4650{
4651        trace_netif_receive_skb_entry(skb);
4652
4653        return netif_receive_skb_internal(skb);
4654}
4655EXPORT_SYMBOL(netif_receive_skb);
4656
4657DEFINE_PER_CPU(struct work_struct, flush_works);
4658
4659/* Network device is going away, flush any packets still pending */
4660static void flush_backlog(struct work_struct *work)
4661{
4662        struct sk_buff *skb, *tmp;
4663        struct softnet_data *sd;
4664
4665        local_bh_disable();
4666        sd = this_cpu_ptr(&softnet_data);
4667
4668        local_irq_disable();
4669        rps_lock(sd);
4670        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4671                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4672                        __skb_unlink(skb, &sd->input_pkt_queue);
4673                        kfree_skb(skb);
4674                        input_queue_head_incr(sd);
4675                }
4676        }
4677        rps_unlock(sd);
4678        local_irq_enable();
4679
4680        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4681                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4682                        __skb_unlink(skb, &sd->process_queue);
4683                        kfree_skb(skb);
4684                        input_queue_head_incr(sd);
4685                }
4686        }
4687        local_bh_enable();
4688}
4689
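    /* Run flush_backlog() on every online CPU via the high-priority workqueue
     * and wait for all of them to complete, so that no per-CPU backlog still
     * holds packets for a device that is being unregistered.
     */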
4690static void flush_all_backlogs(void)
4691{
4692        unsigned int cpu;
4693
4694        get_online_cpus();
4695
4696        for_each_online_cpu(cpu)
4697                queue_work_on(cpu, system_highpri_wq,
4698                              per_cpu_ptr(&flush_works, cpu));
4699
4700        for_each_online_cpu(cpu)
4701                flush_work(per_cpu_ptr(&flush_works, cpu));
4702
4703        put_online_cpus();
4704}
4705
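    /* A held GRO packet is done: let the protocol's gro_complete callback fix
     * up the headers of the merged super-packet (a packet that never got a
     * second segment needs no fixup) and hand it to the normal receive path.
     */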
4706static int napi_gro_complete(struct sk_buff *skb)
4707{
4708        struct packet_offload *ptype;
4709        __be16 type = skb->protocol;
4710        struct list_head *head = &offload_base;
4711        int err = -ENOENT;
4712
4713        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4714
4715        if (NAPI_GRO_CB(skb)->count == 1) {
4716                skb_shinfo(skb)->gso_size = 0;
4717                goto out;
4718        }
4719
4720        rcu_read_lock();
4721        list_for_each_entry_rcu(ptype, head, list) {
4722                if (ptype->type != type || !ptype->callbacks.gro_complete)
4723                        continue;
4724
4725                err = ptype->callbacks.gro_complete(skb, 0);
4726                break;
4727        }
4728        rcu_read_unlock();
4729
4730        if (err) {
4731                WARN_ON(&ptype->list == head);
4732                kfree_skb(skb);
4733                return NET_RX_SUCCESS;
4734        }
4735
4736out:
4737        return netif_receive_skb_internal(skb);
4738}
4739
4740/* napi->gro_list contains packets ordered by age;
4741 * the youngest packets are at the head of the list.
4742 * Complete skbs in reverse order to reduce latencies.
4743 */
4744void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4745{
4746        struct sk_buff *skb, *prev = NULL;
4747
4748        /* scan list and build reverse chain */
4749        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4750                skb->prev = prev;
4751                prev = skb;
4752        }
4753
4754        for (skb = prev; skb; skb = prev) {
4755                skb->next = NULL;
4756
4757                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4758                        return;
4759
4760                prev = skb->prev;
4761                napi_gro_complete(skb);
4762                napi->gro_count--;
4763        }
4764
4765        napi->gro_list = NULL;
4766}
4767EXPORT_SYMBOL(napi_gro_flush);
4768
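    /* Walk napi->gro_list and mark which held packets belong to the same flow
     * as @skb: the raw hash must match, as must the device, VLAN tag, metadata
     * and MAC header.  Entries that differ get same_flow = 0 so they are not
     * considered for merging.
     */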
4769static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4770{
4771        struct sk_buff *p;
4772        unsigned int maclen = skb->dev->hard_header_len;
4773        u32 hash = skb_get_hash_raw(skb);
4774
4775        for (p = napi->gro_list; p; p = p->next) {
4776                unsigned long diffs;
4777
4778                NAPI_GRO_CB(p)->flush = 0;
4779
4780                if (hash != skb_get_hash_raw(p)) {
4781                        NAPI_GRO_CB(p)->same_flow = 0;
4782                        continue;
4783                }
4784
4785                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4786                diffs |= p->vlan_tci ^ skb->vlan_tci;
4787                diffs |= skb_metadata_dst_cmp(p, skb);
4788                diffs |= skb_metadata_differs(p, skb);
4789                if (maclen == ETH_HLEN)
4790                        diffs |= compare_ether_header(skb_mac_header(p),
4791                                                      skb_mac_header(skb));
4792                else if (!diffs)
4793                        diffs = memcmp(skb_mac_header(p),
4794                                       skb_mac_header(skb),
4795                                       maclen);
4796                NAPI_GRO_CB(p)->same_flow = !diffs;
4797        }
4798}
4799
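    /* Reset the GRO offsets for a freshly received skb.  If the linear area
     * holds nothing past the MAC header and the first fragment sits in lowmem,
     * remember it as frag0 so headers can be read straight from the fragment
     * without first pulling them into the linear area.
     */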
4800static void skb_gro_reset_offset(struct sk_buff *skb)
4801{
4802        const struct skb_shared_info *pinfo = skb_shinfo(skb);
4803        const skb_frag_t *frag0 = &pinfo->frags[0];
4804
4805        NAPI_GRO_CB(skb)->data_offset = 0;
4806        NAPI_GRO_CB(skb)->frag0 = NULL;
4807        NAPI_GRO_CB(skb)->frag0_len = 0;
4808
4809        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4810            pinfo->nr_frags &&
4811            !PageHighMem(skb_frag_page(frag0))) {
4812                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4813                NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4814                                                    skb_frag_size(frag0),
4815                                                    skb->end - skb->tail);
4816        }
4817}
4818
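    /* Copy @grow bytes of headers from frag0 into the linear area and shrink
     * the first fragment accordingly; if that fragment becomes empty, release
     * it and shift the remaining fragments down.
     */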
4819static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4820{
4821        struct skb_shared_info *pinfo = skb_shinfo(skb);
4822
4823        BUG_ON(skb->end - skb->tail < grow);
4824
4825        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4826
4827        skb->data_len -= grow;
4828        skb->tail += grow;
4829
4830        pinfo->frags[0].page_offset += grow;
4831        skb_frag_size_sub(&pinfo->frags[0], grow);
4832
4833        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4834                skb_frag_unref(skb, 0);
4835                memmove(pinfo->frags, pinfo->frags + 1,
4836                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4837        }
4838}
4839
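    /* Core GRO decision point: hand the skb to the matching protocol's
     * gro_receive callback, which either merges it into a packet already held
     * on napi->gro_list or leaves it to be held as the head of a new flow.
     * At most MAX_GRO_SKBS flows are held; the oldest one is flushed to make
     * room.  Packets GRO cannot handle take the normal path (GRO_NORMAL).
     */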
4840static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4841{
4842        struct sk_buff **pp = NULL;
4843        struct packet_offload *ptype;
4844        __be16 type = skb->protocol;
4845        struct list_head *head = &offload_base;
4846        int same_flow;
4847        enum gro_result ret;
4848        int grow;
4849
4850        if (netif_elide_gro(skb->dev))
4851                goto normal;
4852
4853        gro_list_prepare(napi, skb);
4854
4855        rcu_read_lock();
4856        list_for_each_entry_rcu(ptype, head, list) {
4857                if (ptype->type != type || !ptype->callbacks.gro_receive)
4858                        continue;
4859
4860                skb_set_network_header(skb, skb_gro_offset(skb));
4861                skb_reset_mac_len(skb);
4862                NAPI_GRO_CB(skb)->same_flow = 0;
4863                NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4864                NAPI_GRO_CB(skb)->free = 0;
4865                NAPI_GRO_CB(skb)->encap_mark = 0;
4866                NAPI_GRO_CB(skb)->recursion_counter = 0;
4867                NAPI_GRO_CB(skb)->is_fou = 0;
4868                NAPI_GRO_CB(skb)->is_atomic = 1;
4869                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4870
4871                /* Setup for GRO checksum validation */
4872                switch (skb->ip_summed) {
4873                case CHECKSUM_COMPLETE:
4874                        NAPI_GRO_CB(skb)->csum = skb->csum;
4875                        NAPI_GRO_CB(skb)->csum_valid = 1;
4876                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4877                        break;
4878                case CHECKSUM_UNNECESSARY:
4879                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4880                        NAPI_GRO_CB(skb)->csum_valid = 0;
4881                        break;
4882                default:
4883                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4884                        NAPI_GRO_CB(skb)->csum_valid = 0;
4885                }
4886
4887                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4888                break;
4889        }
4890        rcu_read_unlock();
4891
4892        if (&ptype->list == head)
4893                goto normal;
4894
4895        if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
4896                ret = GRO_CONSUMED;
4897                goto ok;
4898        }
4899
4900        same_flow = NAPI_GRO_CB(skb)->same_flow;
4901        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4902
4903        if (pp) {
4904                struct sk_buff *nskb = *pp;
4905
4906                *pp = nskb->next;
4907                nskb->next = NULL;
4908                napi_gro_complete(nskb);
4909                napi->gro_count--;
4910        }
4911
4912        if (same_flow)
4913                goto ok;
4914
4915        if (NAPI_GRO_CB(skb)->flush)
4916                goto normal;
4917
4918        if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4919                struct sk_buff *nskb = napi->gro_list;
4920
4921                /* locate the end of the list to select the 'oldest' flow */
4922                while (nskb->next) {
4923                        pp = &nskb->next;
4924                        nskb = *pp;
4925                }
4926                *pp = NULL;
4927                nskb->next = NULL;
4928                napi_gro_complete(nskb);
4929        } else {
4930                napi->gro_count++;
4931        }
4932        NAPI_GRO_CB(skb)->count = 1;
4933        NAPI_GRO_CB(skb)->age = jiffies;
4934        NAPI_GRO_CB(skb)->last = skb;
4935        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4936        skb->next = napi->gro_list;
4937        napi->gro_list = skb;
4938        ret = GRO_HELD;
4939
4940pull:
4941        grow = skb_gro_offset(skb) - skb_headlen(skb);
4942        if (grow > 0)
4943                gro_pull_from_frag0(skb, grow);
4944ok:
4945        return ret;
4946
4947normal:
4948        ret = GRO_NORMAL;
4949        goto pull;
4950}
4951
4952struct packet_offload *gro_find_receive_by_type(__be16 type)
4953{
4954        struct list_head *offload_head = &offload_base;
4955        struct packet_offload *ptype;
4956
4957        list_for_each_entry_rcu(ptype, offload_head, list) {
4958                if (ptype->type != type || !ptype->callbacks.gro_receive)
4959                        continue;
4960                return ptype;
4961        }
4962        return NULL;
4963}
4964EXPORT_SYMBOL(gro_find_receive_by_type);
4965
4966struct packet_offload *gro_find_complete_by_type(__be16 type)
4967{
4968        struct list_head *offload_head = &offload_base;
4969        struct packet_offload *ptype;
4970
4971        list_for_each_entry_rcu(ptype, offload_head, list) {
4972                if (ptype->type != type || !ptype->callbacks.gro_complete)
4973                        continue;
4974                return ptype;
4975        }
4976        return NULL;
4977}
4978EXPORT_SYMBOL(gro_find_complete_by_type);
4979
4980static void napi_skb_free_stolen_head(struct sk_buff *skb)
4981{
4982        skb_dst_drop(skb);
4983        secpath_reset(skb);
4984        kmem_cache_free(skbuff_head_cache, skb);
4985}
4986
4987static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4988{
4989        switch (ret) {
4990        case GRO_NORMAL:
4991                if (netif_receive_skb_internal(skb))
4992                        ret = GRO_DROP;
4993                break;
4994
4995        case GRO_DROP:
4996                kfree_skb(skb);
4997                break;
4998
4999        case GRO_MERGED_FREE:
5000                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5001                        napi_skb_free_stolen_head(skb);
5002                else
5003                        __kfree_skb(skb);
5004                break;
5005
5006        case GRO_HELD:
5007        case GRO_MERGED:
5008        case GRO_CONSUMED:
5009                break;
5010        }
5011
5012        return ret;
5013}
5014
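    /* napi_gro_receive() is what NAPI drivers typically call from their poll
     * routine instead of netif_receive_skb().  Roughly (illustrative sketch
     * only; rx_ring_next_skb() is a made-up placeholder for however the
     * driver builds an skb from its RX ring):
     *
     *	while (work < budget && (skb = rx_ring_next_skb(ring)) != NULL) {
     *		napi_gro_receive(napi, skb);
     *		work++;
     *	}
     */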
5015gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5016{
5017        skb_mark_napi_id(skb, napi);
5018        trace_napi_gro_receive_entry(skb);
5019
5020        skb_gro_reset_offset(skb);
5021
5022        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
5023}
5024EXPORT_SYMBOL(napi_gro_receive);
5025
5026static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
5027{
5028        if (unlikely(skb->pfmemalloc)) {
5029                consume_skb(skb);
5030                return;
5031        }
5032        __skb_pull(skb, skb_headlen(skb));
5033        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
5034        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5035        skb->vlan_tci = 0;
5036        skb->dev = napi->dev;
5037        skb->skb_iif = 0;
5038        skb->encapsulation = 0;
5039        skb_shinfo(skb)->gso_type = 0;
5040        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5041        secpath_reset(skb);
5042
5043        napi->skb = skb;
5044}
5045
5046struct sk_buff *napi_get_frags(struct napi_struct *napi)
5047{
5048        struct sk_buff *skb = napi->skb;
5049
5050        if (!skb) {
5051                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
5052                if (skb) {
5053                        napi->skb = skb;
5054                        skb_mark_napi_id(skb, napi);
5055                }
5056        }
5057        return skb;
5058}
5059EXPORT_SYMBOL(napi_get_frags);
5060
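    /* Finish a packet submitted via napi_gro_frags(): for GRO_NORMAL and
     * GRO_HELD the Ethernet header is pushed back and skb->protocol is set via
     * eth_type_trans(); dropped packets are recycled with napi_reuse_skb(),
     * and merged-and-freed ones either have their stolen head released or are
     * recycled as well.
     */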
5061static gro_result_t napi_frags_finish(struct napi_struct *napi,
5062                                      struct sk_buff *skb,
5063                                      gro_result_t ret)
5064{
5065        switch (ret) {
5066        case GRO_NORMAL:
5067        case GRO_HELD:
5068                __skb_push(skb, ETH_HLEN);
5069                skb->protocol = eth_type_trans(skb, skb->dev);
5070                if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
5071                        ret = GRO_DROP;
5072                break;
5073
5074        case GRO_DROP:
5075                napi_reuse_skb(napi, skb);
5076                break;
5077
5078        case GRO_MERGED_FREE:
5079                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5080                        napi_skb_free_stolen_head(skb);
5081                else
5082                        napi_reuse_skb(napi, skb);
5083                break;
5084
5085        case GRO_MERGED:
5086        case GRO_CONSUMED:
5087                break;
5088        }
5089
5090        return ret;
5091}
5092
5093/* The upper GRO stack assumes the network header starts at gro_offset=0.
5094 * Drivers may call both napi_gro_frags() and napi_gro_receive(), so we
5095 * copy the Ethernet header into skb->data to have a common layout.
5096 */
5097static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
5098{
5099        struct sk_buff *skb = napi->skb;
5100        const struct ethhdr *eth;
5101        unsigned int hlen = sizeof(*eth);
5102
5103        napi->skb = NULL;
5104
5105        skb_reset_mac_header(skb);
5106        skb_gro_reset_offset(skb);
5107
5108        eth = skb_gro_header_fast(skb, 0);
5109        if (unlikely(skb_gro_header_hard(skb, hlen))) {
5110                eth = skb_gro_header_slow(skb, hlen, 0);
5111                if (unlikely(!eth)) {
5112                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
5113                                             __func__, napi->dev->name);
5114                        napi_reuse_skb(napi, skb);
5115                        return NULL;
5116                }
5117        } else {
5118                gro_pull_from_frag0(skb, hlen);
5119                NAPI_GRO_CB(skb)->frag0 += hlen;
5120                NAPI_GRO_CB(skb)->frag0_len -= hlen;
5121        }
5122        __skb_pull(skb, hlen);
5123
5124        /*
5125         * This works because the only protocols we care about don't require
5126         * special handling.
5127         * We'll fix it up properly in napi_frags_finish()
5128         */
5129        skb->protocol = eth->h_proto;
5130
5131        return skb;
5132}
5133
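    /* Drivers that receive directly into page fragments use napi_get_frags() +
     * napi_gro_frags() instead of building a linear skb.  Roughly (illustrative
     * sketch only; page/offset/len/truesize come from the driver's RX
     * descriptor and error handling is omitted):
     *
     *	skb = napi_get_frags(napi);
     *	if (!skb)
     *		goto drop;
     *	skb_add_rx_frag(skb, 0, page, offset, len, truesize);
     *	napi_gro_frags(napi);
     */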
5134gro_result_t napi_gro_frags(struct napi_struct *napi)
5135{
5136        struct sk_buff *skb = napi_frags_skb(napi);
5137
5138        if (!skb)
5139                return GRO_DROP;
5140
5141        trace_napi_gro_frags_entry(skb);
5142
5143        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5144}
5145EXPORT_SYMBOL(napi_gro_frags);
5146
5147/* Compute the checksum from gro_offset and return the folded value
5148 * after adding in any pseudo checksum.
5149 */
5150__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
5151{
5152        __wsum wsum;
5153        __sum16 sum;
5154
5155        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
5156
5157        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
5158        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
5159        if (likely(!sum)) {
5160                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
5161                    !skb->csum_complete_sw)
5162                        netdev_rx_csum_fault(skb->dev);
5163        }
5164
5165        NAPI_GRO_CB(skb)->csum = wsum;
5166        NAPI_GRO_CB(skb)->csum_valid = 1;
5167
5168        return sum;
5169}
5170EXPORT_SYMBOL(__skb_gro_checksum_complete);
5171
5172static void net_rps_send_ipi(struct softnet_data *remsd)
5173{
5174#ifdef CONFIG_RPS
5175        while (remsd) {
5176                struct softnet_data *next = remsd->rps_ipi_next;
5177
5178                if (cpu_online(remsd->cpu))
5179                        smp_call_function_single_async(remsd->cpu, &remsd->csd);
5180                remsd = next;
5181        }
5182#endif
5183}
5184
5185/*
5186 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
5187 * Note: called with local irq disabled, but exits with local irq enabled.
5188 */
5189static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5190{
5191#ifdef CONFIG_RPS
5192        struct softnet_data *remsd = sd->rps_ipi_list;
5193
5194        if (remsd) {
5195                sd->rps_ipi_list = NULL;
5196
5197                local_irq_enable();
5198
5199                /* Send pending IPIs to kick RPS processing on remote CPUs. */
5200                net_rps_send_ipi(remsd);
5201        } else
5202#endif
5203                local_irq_enable();
5204}
5205
5206static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5207{
5208#ifdef CONFIG_RPS
5209        return sd->rps_ipi_list != NULL;
5210#else
5211        return false;
5212#endif
5213}
5214
5215static int process_backlog(struct napi_struct *napi, int quota)
5216{
5217        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5218        bool again = true;
5219        int work = 0;
5220
5221        /* Check if we have pending IPIs; it's better to send them now
5222         * rather than waiting for net_rx_action() to end.
5223         */
5224        if (sd_has_rps_ipi_waiting(sd)) {
5225                local_irq_disable();
5226                net_rps_action_and_irq_enable(sd);
5227        }
5228
5229        napi->weight = dev_rx_weight;
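            /* Two queues are involved: process_queue is drained by this CPU
             * without the RPS lock, while producers append to input_pkt_queue;
             * once process_queue runs dry, input_pkt_queue is spliced onto it
             * under rps_lock and the loop goes around again.
             */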
5230        while (again) {
5231                struct sk_buff *skb;
5232
5233                while ((skb = __skb_dequeue(&sd->process_queue))) {
5234                        rcu_read_lock();
5235                        __netif_receive_skb(skb);
5236                        rcu_read_unlock();
5237                        input_queue_head_incr(sd);
5238                        if (++work >= quota)
5239                                return work;
5240
5241                }
5242
5243                local_irq_disable();
5244                rps_lock(sd);
5245                if (skb_queue_empty(&sd->input_pkt_queue)) {
5246                        /*
5247                         * Inline a custom version of __napi_complete().
5248                         * Only the current CPU owns and manipulates this napi,
5249                         * and NAPI_STATE_SCHED is the only possible flag set
5250                         * on backlog.
5251                         * We can use a plain write instead of clear_bit(),
5252                         * and we don't need an smp_mb() memory barrier.
5253                         */
5254                        napi->state = 0;
5255                        again = false;
5256                } else {
5257                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
5258                                                   &sd->process_queue);
5259                }
5260                rps_unlock(sd);
5261                local_irq_enable();
5262        }
5263
5264        return work;
5265}
5266
5267/**
5268 * __napi_schedule - schedule for receive
5269 * @n: entry to schedule
5270 *
5271 * The entry's receive function will be scheduled to run.
5272 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
5273 */
5274void __napi_schedule(struct napi_struct *n)
5275{
5276        unsigned long flags;
5277
5278        local_irq_save(flags);
5279        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5280        local_irq_restore(flags);
5281}
5282EXPORT_SYMBOL(__napi_schedule);
5283
5284/**
5285 *      napi_schedule_prep - check if napi can be scheduled
5286 *      @n: napi context
5287 *
5288 * Test if the NAPI routine is already running, and if not mark
5289 * it as running.  This is used as a condition variable to
5290 * ensure only one NAPI poll instance runs.  We also make
5291 * sure there is no pending NAPI disable.
5292 */
5293bool napi_schedule_prep(struct napi_struct *n)
5294{
5295        unsigned long val, new;
5296
5297        do {
5298                val = READ_ONCE(n->state);
5299                if (unlikely(val & NAPIF_STATE_DISABLE))
5300                        return false;
5301                new = val | NAPIF_STATE_SCHED;
5302
5303                /* Set the STATE_MISSED bit if STATE_SCHED was already set.
5304                 * This form was suggested by Alexander Duyck, as the compiler
5305                 * emits better code than:
5306                 * if (val & NAPIF_STATE_SCHED)
5307                 *     new |= NAPIF_STATE_MISSED;
5308                 */
5309                new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
5310                                                   NAPIF_STATE_MISSED;
5311        } while (cmpxchg(&n->state, val, new) != val);
5312
5313        return !(val & NAPIF_STATE_SCHED);
5314}
5315EXPORT_SYMBOL(napi_schedule_prep);
5316
5317/**
5318 * __napi_schedule_irqoff - schedule for receive
5319 * @n: entry to schedule
5320 *
5321 * Variant of __napi_schedule() assuming hard irqs are masked
5322 */
5323void __napi_schedule_irqoff(struct napi_struct *n)
5324{
5325        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5326}
5327EXPORT_SYMBOL(__napi_schedule_irqoff);
5328
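    /* Called by a driver's poll routine once it has used less than its budget:
     * flush any held GRO packets (or defer the flush with the gro_flush_timeout
     * hrtimer), drop the instance from the poll list and clear SCHED.  Returns
     * false when the poll must not be considered complete, e.g. during netpoll
     * or busy polling, or when a missed schedule forces another poll round.
     */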
5329bool napi_complete_done(struct napi_struct *n, int work_done)
5330{
5331        unsigned long flags, val, new;
5332
5333        /*
5334         * 1) Don't let napi dequeue from the CPU poll list,
5335         *    just in case it's running on a different CPU.
5336         * 2) If we are busy polling, do nothing here; we have
5337         *    the guarantee we will be called later.
5338         */
5339        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
5340                                 NAPIF_STATE_IN_BUSY_POLL)))
5341                return false;
5342
5343        if (n->gro_list) {
5344                unsigned long timeout = 0;
5345
5346                if (work_done)
5347                        timeout = n->dev->gro_flush_timeout;
5348
5349                if (timeout)
5350                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
5351                                      HRTIMER_MODE_REL_PINNED);
5352                else
5353                        napi_gro_flush(n, false);
5354        }
5355        if (unlikely(!list_empty(&n->poll_list))) {
5356                /* If n->poll_list is not empty, we need to mask irqs */
5357                local_irq_save(flags);
5358                list_del_init(&n->poll_list);
5359                local_irq_restore(flags);
5360        }
5361
5362        do {
5363                val = READ_ONCE(n->state);
5364
5365                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5366
5367                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
5368
5369                /* If STATE_MISSED was set, leave STATE_SCHED set,
5370                 * because we will call napi->poll() one more time.
5371                 * This C code was suggested by Alexander Duyck to help gcc.
5372                 */
5373                new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5374                                                    NAPIF_STATE_SCHED;
5375        } while (cmpxchg(&n->state, val, new) != val);
5376
5377        if (unlikely(val & NAPIF_STATE_MISSED)) {
5378                __napi_schedule(n);
5379                return false;
5380        }
5381
5382        return true;
5383}
5384EXPORT_SYMBOL(napi_complete_done);
5385
5386/* Must be called under rcu_read_lock(), as we don't take a reference. */
5387static struct napi_struct *napi_by_id(unsigned int napi_id)
5388{
5389        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5390        struct napi_struct *napi;
5391
5392        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5393                if (napi->napi_id == napi_id)
5394                        return napi;
5395
5396        return NULL;
5397}
5398
5399#if defined(CONFIG_NET_RX_BUSY_POLL)
5400
5401#define BUSY_POLL_BUDGET 8
5402
5403static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5404{
5405        int rc;
5406
5407        /* Busy polling means there is a high chance that the device driver's
5408         * hard irq could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5409         * set in napi_schedule_prep().
5410         * Since we are about to call napi->poll() once more, we can safely
5411         * clear NAPI_STATE_MISSED.
5412         *
5413         * Note: x86 could use a single "lock and ..." instruction
5414         * to perform these two clear_bit() calls.
5415         */
5416        clear_bit(NAPI_STATE_MISSED, &napi->state);
5417        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5418
5419        local_bh_disable();
5420
5421        /* All we really want here is to re-enable device interrupts.
5422         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5423         */
5424        rc = napi->poll(napi, BUSY_POLL_BUDGET);
5425        trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5426        netpoll_poll_unlock(have_poll_lock);
5427        if (rc == BUSY_POLL_BUDGET)
5428                __napi_schedule(napi);
5429        local_bh_enable();
5430}
5431
5432void napi_busy_loop(unsigned int napi_id,
5433                    bool (*loop_end)(void *, unsigned long),
5434                    void *loop_end_arg)
5435{
5436        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
5437        int (*napi_poll)(struct napi_struct *napi, int budget);
5438        void *have_poll_lock = NULL;
5439        struct napi_struct *napi;
5440
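            /* Busy-poll loop: atomically claim the NAPI instance by setting
             * SCHED | IN_BUSY_POLL (skipping the actual poll when someone else
             * already owns it), run napi->poll() with BUSY_POLL_BUDGET, and
             * keep going until loop_end() says stop or a reschedule is needed,
             * at which point busy_poll_stop() hands control back to the normal
             * interrupt-driven path.
             */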
5441restart:
5442        napi_poll = NULL;
5443
5444        rcu_read_lock();
5445
5446        napi = napi_by_id(napi_id);
5447        if (!napi)
5448                goto out;
5449
5450        preempt_disable();
5451        for (;;) {
5452                int work = 0;
5453
5454                local_bh_disable();
5455                if (!napi_poll) {
5456                        unsigned long val = READ_ONCE(napi->state);
5457
5458                        /* If multiple threads are competing for this napi,
5459                         * we avoid dirtying napi->state as much as we can.
5460                         */
5461                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5462                                   NAPIF_STATE_IN_BUSY_POLL))
5463                                goto count;
5464                        if (cmpxchg(&napi->state, val,
5465                                    val | NAPIF_STATE_IN_BUSY_POLL |
5466                                          NAPIF_STATE_SCHED) != val)
5467                                goto count;
5468                        have_poll_lock = netpoll_poll_lock(napi);
5469                        napi_poll = napi->poll;
5470                }
5471                work = napi_poll(napi, BUSY_POLL_BUDGET);
5472                trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
5473count:
5474                if (work > 0)
5475                        __NET_ADD_STATS(dev_net(napi->dev),
5476                                        LINUX_MIB_BUSYPOLLRXPACKETS, work);
5477                local_bh_enable();
5478
5479                if (!loop_end || loop_end(loop_end_arg, start_time))
5480                        break;
5481
5482                if (unlikely(need_resched())) {
5483                        if (napi_poll)
5484                                busy_poll_stop(napi, have_poll_lock);
5485                        preempt_enable();
5486                        rcu_read_unlock();
5487                        cond_resched();
5488                        if (loop_end(loop_end_arg, start_time))
5489                                return;
5490                        goto restart;
5491                }
5492                cpu_relax();
5493        }
5494        if (napi_poll)
5495                busy_poll_stop(napi, have_poll_lock);
5496        preempt_enable();
5497out:
5498        rcu_read_unlock();
5499}
5500EXPORT_SYMBOL(napi_busy_loop);
5501
5502#endif /* CONFIG_NET_RX_BUSY_POLL */
5503
5504static void napi_hash_add(struct napi_struct *napi)
5505{
5506        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5507            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5508                return;
5509
5510        spin_lock(&napi_hash_lock);
5511
5512        /* 0..NR_CPUS range is reserved for sender_cpu use */
5513        do {
5514                if (unlikely(++napi_gen_id < MIN_NAPI_ID))
5515                        napi_gen_id = MIN_NAPI_ID;
5516        } while (napi_by_id(napi_gen_id));
5517        napi->napi_id = napi_gen_id;
5518
5519        hlist_add_head_rcu(&napi->napi_hash_node,
5520                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5521
5522        spin_unlock(&napi_hash_lock);
5523}
5524
5525/* Warning: the caller is responsible for making sure the RCU grace period