linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <net/busy_poll.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/stat.h>
 102#include <net/dst.h>
 103#include <net/dst_metadata.h>
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/module.h>
 110#include <linux/netpoll.h>
 111#include <linux/rcupdate.h>
 112#include <linux/delay.h>
 113#include <net/iw_handler.h>
 114#include <asm/current.h>
 115#include <linux/audit.h>
 116#include <linux/dmaengine.h>
 117#include <linux/err.h>
 118#include <linux/ctype.h>
 119#include <linux/if_arp.h>
 120#include <linux/if_vlan.h>
 121#include <linux/ip.h>
 122#include <net/ip.h>
 123#include <net/mpls.h>
 124#include <linux/ipv6.h>
 125#include <linux/in.h>
 126#include <linux/jhash.h>
 127#include <linux/random.h>
 128#include <trace/events/napi.h>
 129#include <trace/events/net.h>
 130#include <trace/events/skb.h>
 131#include <linux/pci.h>
 132#include <linux/inetdevice.h>
 133#include <linux/cpu_rmap.h>
 134#include <linux/static_key.h>
 135#include <linux/hashtable.h>
 136#include <linux/vmalloc.h>
 137#include <linux/if_macvlan.h>
 138#include <linux/errqueue.h>
 139#include <linux/hrtimer.h>
 140#include <linux/netfilter_ingress.h>
 141#include <linux/sctp.h>
 142
 143#include "net-sysfs.h"
 144
 145/* Instead of increasing this, you should create a hash table. */
 146#define MAX_GRO_SKBS 8
 147
 148/* This should be increased if a protocol with a bigger head is added. */
 149#define GRO_MAX_HEAD (MAX_HEADER + 128)
 150
 151static DEFINE_SPINLOCK(ptype_lock);
 152static DEFINE_SPINLOCK(offload_lock);
 153struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 154struct list_head ptype_all __read_mostly;       /* Taps */
 155static struct list_head offload_base __read_mostly;
 156
 157static int netif_rx_internal(struct sk_buff *skb);
 158static int call_netdevice_notifiers_info(unsigned long val,
 159                                         struct net_device *dev,
 160                                         struct netdev_notifier_info *info);
 161
 162/*
 163 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 164 * semaphore.
 165 *
 166 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 167 *
 168 * Writers must hold the rtnl semaphore while they loop through the
 169 * dev_base_head list, and hold dev_base_lock for writing when they do the
 170 * actual updates.  This allows pure readers to access the list even
 171 * while a writer is preparing to update it.
 172 *
 173 * To put it another way, dev_base_lock is held for writing only to
 174 * protect against pure readers; the rtnl semaphore provides the
 175 * protection against other writers.
 176 *
 177 * See, for example usages, register_netdevice() and
 178 * unregister_netdevice(), which must be called with the rtnl
 179 * semaphore held.
 180 */
 181DEFINE_RWLOCK(dev_base_lock);
 182EXPORT_SYMBOL(dev_base_lock);
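
/*
 * Illustrative sketch, not part of the original source: per the locking
 * rules above, a pure reader may walk the device list under RCU (or under
 * dev_base_lock) without taking the rtnl semaphore. The helper name is
 * hypothetical.
 */
static __maybe_unused void example_walk_devices(struct net *net)
{
        struct net_device *dev;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                pr_info("dev %s (ifindex %d)\n", dev->name, dev->ifindex);
        rcu_read_unlock();
}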
 183
 184/* protects napi_hash addition/deletion and napi_gen_id */
 185static DEFINE_SPINLOCK(napi_hash_lock);
 186
 187static unsigned int napi_gen_id = NR_CPUS;
 188static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 189
 190static seqcount_t devnet_rename_seq;
 191
 192static inline void dev_base_seq_inc(struct net *net)
 193{
 194        while (++net->dev_base_seq == 0);
 195}
 196
 197static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 198{
 199        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 200
 201        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 202}
 203
 204static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 205{
 206        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 207}
 208
 209static inline void rps_lock(struct softnet_data *sd)
 210{
 211#ifdef CONFIG_RPS
 212        spin_lock(&sd->input_pkt_queue.lock);
 213#endif
 214}
 215
 216static inline void rps_unlock(struct softnet_data *sd)
 217{
 218#ifdef CONFIG_RPS
 219        spin_unlock(&sd->input_pkt_queue.lock);
 220#endif
 221}
 222
 223/* Device list insertion */
 224static void list_netdevice(struct net_device *dev)
 225{
 226        struct net *net = dev_net(dev);
 227
 228        ASSERT_RTNL();
 229
 230        write_lock_bh(&dev_base_lock);
 231        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 232        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 233        hlist_add_head_rcu(&dev->index_hlist,
 234                           dev_index_hash(net, dev->ifindex));
 235        write_unlock_bh(&dev_base_lock);
 236
 237        dev_base_seq_inc(net);
 238}
 239
 240/* Device list removal
 241 * caller must respect a RCU grace period before freeing/reusing dev
 242 */
 243static void unlist_netdevice(struct net_device *dev)
 244{
 245        ASSERT_RTNL();
 246
 247        /* Unlink dev from the device chain */
 248        write_lock_bh(&dev_base_lock);
 249        list_del_rcu(&dev->dev_list);
 250        hlist_del_rcu(&dev->name_hlist);
 251        hlist_del_rcu(&dev->index_hlist);
 252        write_unlock_bh(&dev_base_lock);
 253
 254        dev_base_seq_inc(dev_net(dev));
 255}
 256
 257/*
 258 *      Our notifier list
 259 */
 260
 261static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263/*
 264 *      Device drivers call our routines to queue packets here. We empty the
 265 *      queue in the local softnet handler.
 266 */
 267
 268DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271#ifdef CONFIG_LOCKDEP
 272/*
 273 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274 * according to dev->type
 275 */
 276static const unsigned short netdev_lock_type[] =
 277        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 290         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 291         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 292
 293static const char *const netdev_lock_name[] =
 294        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 295         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 296         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 297         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 298         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 299         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 300         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 301         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 302         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 303         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 304         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 305         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 306         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 307         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 308         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 309
 310static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 311static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312
 313static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 314{
 315        int i;
 316
 317        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 318                if (netdev_lock_type[i] == dev_type)
 319                        return i;
 320        /* the last key is used by default */
 321        return ARRAY_SIZE(netdev_lock_type) - 1;
 322}
 323
 324static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325                                                 unsigned short dev_type)
 326{
 327        int i;
 328
 329        i = netdev_lock_pos(dev_type);
 330        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 331                                   netdev_lock_name[i]);
 332}
 333
 334static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335{
 336        int i;
 337
 338        i = netdev_lock_pos(dev->type);
 339        lockdep_set_class_and_name(&dev->addr_list_lock,
 340                                   &netdev_addr_lock_key[i],
 341                                   netdev_lock_name[i]);
 342}
 343#else
 344static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 345                                                 unsigned short dev_type)
 346{
 347}
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350}
 351#endif
 352
 353/*******************************************************************************
 354
 355                Protocol management and registration routines
 356
 357*******************************************************************************/
 358
 359/*
 360 *      Add a protocol ID to the list. Now that the input handler is
 361 *      smarter we can dispense with all the messy stuff that used to be
 362 *      here.
 363 *
  364 *      BEWARE!!! Protocol handlers that mangle input packets
  365 *      MUST BE last in the hash buckets, and protocol handler checks
  366 *      MUST start from the promiscuous ptype_all chain in net_bh.
  367 *      This holds today; do not change it.
  368 *      Explanation: if a packet-mangling protocol handler were the
  369 *      first on the list, it could not detect that the packet is
  370 *      cloned and must be copied-on-write, so it would modify it in
  371 *      place and subsequent readers would see a broken packet.
  372 *                                                      --ANK (980803)
 373 */
 374
 375static inline struct list_head *ptype_head(const struct packet_type *pt)
 376{
 377        if (pt->type == htons(ETH_P_ALL))
 378                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 379        else
 380                return pt->dev ? &pt->dev->ptype_specific :
 381                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 382}
 383
 384/**
 385 *      dev_add_pack - add packet handler
 386 *      @pt: packet type declaration
 387 *
 388 *      Add a protocol handler to the networking stack. The passed &packet_type
 389 *      is linked into kernel lists and may not be freed until it has been
 390 *      removed from the kernel lists.
 391 *
  392 *      This call does not sleep, therefore it cannot guarantee that
  393 *      all CPUs that are in the middle of receiving packets will see
  394 *      the new packet type (until the next received packet).
 395 */
 396
 397void dev_add_pack(struct packet_type *pt)
 398{
 399        struct list_head *head = ptype_head(pt);
 400
 401        spin_lock(&ptype_lock);
 402        list_add_rcu(&pt->list, head);
 403        spin_unlock(&ptype_lock);
 404}
 405EXPORT_SYMBOL(dev_add_pack);
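
/*
 * Illustrative sketch, not part of the original source: a minimal tap that
 * registers for every protocol. The names are hypothetical; a real handler
 * is responsible for the skb reference it is given.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        /* Inspect the packet here, then drop our reference. */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_tap __maybe_unused = {
        .type = cpu_to_be16(ETH_P_ALL),
        .func = example_tap_rcv,
};
/* Registered with dev_add_pack(&example_tap), removed with dev_remove_pack(). */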
 406
 407/**
 408 *      __dev_remove_pack        - remove packet handler
 409 *      @pt: packet type declaration
 410 *
 411 *      Remove a protocol handler that was previously added to the kernel
 412 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 413 *      from the kernel lists and can be freed or reused once this function
 414 *      returns.
 415 *
 416 *      The packet type might still be in use by receivers
  417 *      and must not be freed until after all the CPUs have gone
 418 *      through a quiescent state.
 419 */
 420void __dev_remove_pack(struct packet_type *pt)
 421{
 422        struct list_head *head = ptype_head(pt);
 423        struct packet_type *pt1;
 424
 425        spin_lock(&ptype_lock);
 426
 427        list_for_each_entry(pt1, head, list) {
 428                if (pt == pt1) {
 429                        list_del_rcu(&pt->list);
 430                        goto out;
 431                }
 432        }
 433
 434        pr_warn("dev_remove_pack: %p not found\n", pt);
 435out:
 436        spin_unlock(&ptype_lock);
 437}
 438EXPORT_SYMBOL(__dev_remove_pack);
 439
 440/**
 441 *      dev_remove_pack  - remove packet handler
 442 *      @pt: packet type declaration
 443 *
 444 *      Remove a protocol handler that was previously added to the kernel
 445 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 446 *      from the kernel lists and can be freed or reused once this function
 447 *      returns.
 448 *
 449 *      This call sleeps to guarantee that no CPU is looking at the packet
 450 *      type after return.
 451 */
 452void dev_remove_pack(struct packet_type *pt)
 453{
 454        __dev_remove_pack(pt);
 455
 456        synchronize_net();
 457}
 458EXPORT_SYMBOL(dev_remove_pack);
 459
 460
 461/**
 462 *      dev_add_offload - register offload handlers
 463 *      @po: protocol offload declaration
 464 *
 465 *      Add protocol offload handlers to the networking stack. The passed
 466 *      &proto_offload is linked into kernel lists and may not be freed until
 467 *      it has been removed from the kernel lists.
 468 *
  469 *      This call does not sleep, therefore it cannot guarantee that
  470 *      all CPUs that are in the middle of receiving packets will see
  471 *      the new offload handlers (until the next received packet).
 472 */
 473void dev_add_offload(struct packet_offload *po)
 474{
 475        struct packet_offload *elem;
 476
 477        spin_lock(&offload_lock);
 478        list_for_each_entry(elem, &offload_base, list) {
 479                if (po->priority < elem->priority)
 480                        break;
 481        }
 482        list_add_rcu(&po->list, elem->list.prev);
 483        spin_unlock(&offload_lock);
 484}
 485EXPORT_SYMBOL(dev_add_offload);
 486
 487/**
 488 *      __dev_remove_offload     - remove offload handler
 489 *      @po: packet offload declaration
 490 *
 491 *      Remove a protocol offload handler that was previously added to the
 492 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 493 *      is removed from the kernel lists and can be freed or reused once this
 494 *      function returns.
 495 *
 496 *      The packet type might still be in use by receivers
  497 *      and must not be freed until after all the CPUs have gone
 498 *      through a quiescent state.
 499 */
 500static void __dev_remove_offload(struct packet_offload *po)
 501{
 502        struct list_head *head = &offload_base;
 503        struct packet_offload *po1;
 504
 505        spin_lock(&offload_lock);
 506
 507        list_for_each_entry(po1, head, list) {
 508                if (po == po1) {
 509                        list_del_rcu(&po->list);
 510                        goto out;
 511                }
 512        }
 513
 514        pr_warn("dev_remove_offload: %p not found\n", po);
 515out:
 516        spin_unlock(&offload_lock);
 517}
 518
 519/**
 520 *      dev_remove_offload       - remove packet offload handler
 521 *      @po: packet offload declaration
 522 *
 523 *      Remove a packet offload handler that was previously added to the kernel
 524 *      offload handlers by dev_add_offload(). The passed &offload_type is
 525 *      removed from the kernel lists and can be freed or reused once this
 526 *      function returns.
 527 *
 528 *      This call sleeps to guarantee that no CPU is looking at the packet
 529 *      type after return.
 530 */
 531void dev_remove_offload(struct packet_offload *po)
 532{
 533        __dev_remove_offload(po);
 534
 535        synchronize_net();
 536}
 537EXPORT_SYMBOL(dev_remove_offload);
 538
 539/******************************************************************************
 540
 541                      Device Boot-time Settings Routines
 542
 543*******************************************************************************/
 544
 545/* Boot time configuration table */
 546static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 547
 548/**
 549 *      netdev_boot_setup_add   - add new setup entry
 550 *      @name: name of the device
 551 *      @map: configured settings for the device
 552 *
  553 *      Adds a new setup entry to the dev_boot_setup list.  The function
  554 *      returns 0 on error and 1 on success.  This is a generic routine for
  555 *      all netdevices.
 556 */
 557static int netdev_boot_setup_add(char *name, struct ifmap *map)
 558{
 559        struct netdev_boot_setup *s;
 560        int i;
 561
 562        s = dev_boot_setup;
 563        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 564                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 565                        memset(s[i].name, 0, sizeof(s[i].name));
 566                        strlcpy(s[i].name, name, IFNAMSIZ);
 567                        memcpy(&s[i].map, map, sizeof(s[i].map));
 568                        break;
 569                }
 570        }
 571
 572        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 573}
 574
 575/**
 576 *      netdev_boot_setup_check - check boot time settings
 577 *      @dev: the netdevice
 578 *
  579 *      Check the boot time settings for the device.
  580 *      Any settings found are applied to the device so they can be used
  581 *      later during device probing.
  582 *      Returns 0 if no settings are found, 1 if they are.
 583 */
 584int netdev_boot_setup_check(struct net_device *dev)
 585{
 586        struct netdev_boot_setup *s = dev_boot_setup;
 587        int i;
 588
 589        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 590                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 591                    !strcmp(dev->name, s[i].name)) {
 592                        dev->irq        = s[i].map.irq;
 593                        dev->base_addr  = s[i].map.base_addr;
 594                        dev->mem_start  = s[i].map.mem_start;
 595                        dev->mem_end    = s[i].map.mem_end;
 596                        return 1;
 597                }
 598        }
 599        return 0;
 600}
 601EXPORT_SYMBOL(netdev_boot_setup_check);
 602
 603
 604/**
 605 *      netdev_boot_base        - get address from boot time settings
 606 *      @prefix: prefix for network device
 607 *      @unit: id for network device
 608 *
  609 *      Check the boot time settings for the base address of the device.
  610 *      Any settings found are applied to the device so they can be used
  611 *      later during device probing.
  612 *      Returns 0 if no settings are found.
 613 */
 614unsigned long netdev_boot_base(const char *prefix, int unit)
 615{
 616        const struct netdev_boot_setup *s = dev_boot_setup;
 617        char name[IFNAMSIZ];
 618        int i;
 619
 620        sprintf(name, "%s%d", prefix, unit);
 621
 622        /*
 623         * If device already registered then return base of 1
 624         * to indicate not to probe for this interface
 625         */
 626        if (__dev_get_by_name(&init_net, name))
 627                return 1;
 628
 629        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 630                if (!strcmp(name, s[i].name))
 631                        return s[i].map.base_addr;
 632        return 0;
 633}
 634
 635/*
  636 * Saves the settings configured at boot time for any netdevice.
 637 */
 638int __init netdev_boot_setup(char *str)
 639{
 640        int ints[5];
 641        struct ifmap map;
 642
 643        str = get_options(str, ARRAY_SIZE(ints), ints);
 644        if (!str || !*str)
 645                return 0;
 646
 647        /* Save settings */
 648        memset(&map, 0, sizeof(map));
 649        if (ints[0] > 0)
 650                map.irq = ints[1];
 651        if (ints[0] > 1)
 652                map.base_addr = ints[2];
 653        if (ints[0] > 2)
 654                map.mem_start = ints[3];
 655        if (ints[0] > 3)
 656                map.mem_end = ints[4];
 657
 658        /* Add new entry to the list */
 659        return netdev_boot_setup_add(str, &map);
 660}
 661
 662__setup("netdev=", netdev_boot_setup);
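
/*
 * Illustrative example, not part of the original source: assuming the
 * conventional "netdev=" command-line format of up to four integers followed
 * by a name, a boot parameter such as
 *
 *      netdev=9,0x300,0,0,eth0
 *
 * would record irq 9 and I/O base 0x300 to be applied when a device named
 * "eth0" is probed (see netdev_boot_setup_check() above).
 */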
 663
 664/*******************************************************************************
 665
 666                            Device Interface Subroutines
 667
 668*******************************************************************************/
 669
 670/**
  671 *      dev_get_iflink  - get 'iflink' value of an interface
 672 *      @dev: targeted interface
 673 *
 674 *      Indicates the ifindex the interface is linked to.
 675 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 676 */
 677
 678int dev_get_iflink(const struct net_device *dev)
 679{
 680        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 681                return dev->netdev_ops->ndo_get_iflink(dev);
 682
 683        return dev->ifindex;
 684}
 685EXPORT_SYMBOL(dev_get_iflink);
 686
 687/**
 688 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 689 *      @dev: targeted interface
 690 *      @skb: The packet.
 691 *
  692 *      For better visibility of tunnel traffic, OVS needs to retrieve
  693 *      egress tunnel information for a packet. The following API allows
  694 *      the user to get this info.
 695 */
 696int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 697{
 698        struct ip_tunnel_info *info;
 699
 700        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 701                return -EINVAL;
 702
 703        info = skb_tunnel_info_unclone(skb);
 704        if (!info)
 705                return -ENOMEM;
 706        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 707                return -EINVAL;
 708
 709        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 710}
 711EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 712
 713/**
 714 *      __dev_get_by_name       - find a device by its name
 715 *      @net: the applicable net namespace
 716 *      @name: name to find
 717 *
 718 *      Find an interface by name. Must be called under RTNL semaphore
 719 *      or @dev_base_lock. If the name is found a pointer to the device
 720 *      is returned. If the name is not found then %NULL is returned. The
 721 *      reference counters are not incremented so the caller must be
 722 *      careful with locks.
 723 */
 724
 725struct net_device *__dev_get_by_name(struct net *net, const char *name)
 726{
 727        struct net_device *dev;
 728        struct hlist_head *head = dev_name_hash(net, name);
 729
 730        hlist_for_each_entry(dev, head, name_hlist)
 731                if (!strncmp(dev->name, name, IFNAMSIZ))
 732                        return dev;
 733
 734        return NULL;
 735}
 736EXPORT_SYMBOL(__dev_get_by_name);
 737
 738/**
 739 *      dev_get_by_name_rcu     - find a device by its name
 740 *      @net: the applicable net namespace
 741 *      @name: name to find
 742 *
 743 *      Find an interface by name.
 744 *      If the name is found a pointer to the device is returned.
 745 *      If the name is not found then %NULL is returned.
 746 *      The reference counters are not incremented so the caller must be
 747 *      careful with locks. The caller must hold RCU lock.
 748 */
 749
 750struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 751{
 752        struct net_device *dev;
 753        struct hlist_head *head = dev_name_hash(net, name);
 754
 755        hlist_for_each_entry_rcu(dev, head, name_hlist)
 756                if (!strncmp(dev->name, name, IFNAMSIZ))
 757                        return dev;
 758
 759        return NULL;
 760}
 761EXPORT_SYMBOL(dev_get_by_name_rcu);
 762
 763/**
 764 *      dev_get_by_name         - find a device by its name
 765 *      @net: the applicable net namespace
 766 *      @name: name to find
 767 *
 768 *      Find an interface by name. This can be called from any
 769 *      context and does its own locking. The returned handle has
 770 *      the usage count incremented and the caller must use dev_put() to
 771 *      release it when it is no longer needed. %NULL is returned if no
 772 *      matching device is found.
 773 */
 774
 775struct net_device *dev_get_by_name(struct net *net, const char *name)
 776{
 777        struct net_device *dev;
 778
 779        rcu_read_lock();
 780        dev = dev_get_by_name_rcu(net, name);
 781        if (dev)
 782                dev_hold(dev);
 783        rcu_read_unlock();
 784        return dev;
 785}
 786EXPORT_SYMBOL(dev_get_by_name);
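
/*
 * Illustrative sketch, not part of the original source: dev_get_by_name()
 * takes a reference, so the caller must balance it with dev_put(). The
 * helper name and the "lo" lookup are hypothetical.
 */
static __maybe_unused void example_lookup_by_name(void)
{
        struct net_device *dev = dev_get_by_name(&init_net, "lo");

        if (dev) {
                pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
                dev_put(dev);
        }
}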
 787
 788/**
 789 *      __dev_get_by_index - find a device by its ifindex
 790 *      @net: the applicable net namespace
 791 *      @ifindex: index of device
 792 *
  793 *      Search for an interface by index. Returns a pointer to the device,
  794 *      or %NULL if the device is not found. The device has not
 795 *      had its reference counter increased so the caller must be careful
 796 *      about locking. The caller must hold either the RTNL semaphore
 797 *      or @dev_base_lock.
 798 */
 799
 800struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 801{
 802        struct net_device *dev;
 803        struct hlist_head *head = dev_index_hash(net, ifindex);
 804
 805        hlist_for_each_entry(dev, head, index_hlist)
 806                if (dev->ifindex == ifindex)
 807                        return dev;
 808
 809        return NULL;
 810}
 811EXPORT_SYMBOL(__dev_get_by_index);
 812
 813/**
 814 *      dev_get_by_index_rcu - find a device by its ifindex
 815 *      @net: the applicable net namespace
 816 *      @ifindex: index of device
 817 *
  818 *      Search for an interface by index. Returns a pointer to the device,
  819 *      or %NULL if the device is not found. The device has not
 820 *      had its reference counter increased so the caller must be careful
 821 *      about locking. The caller must hold RCU lock.
 822 */
 823
 824struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 825{
 826        struct net_device *dev;
 827        struct hlist_head *head = dev_index_hash(net, ifindex);
 828
 829        hlist_for_each_entry_rcu(dev, head, index_hlist)
 830                if (dev->ifindex == ifindex)
 831                        return dev;
 832
 833        return NULL;
 834}
 835EXPORT_SYMBOL(dev_get_by_index_rcu);
 836
 837
 838/**
 839 *      dev_get_by_index - find a device by its ifindex
 840 *      @net: the applicable net namespace
 841 *      @ifindex: index of device
 842 *
  843 *      Search for an interface by index. Returns a pointer to the device,
  844 *      or NULL if the device is not found. The device returned has
 845 *      had a reference added and the pointer is safe until the user calls
 846 *      dev_put to indicate they have finished with it.
 847 */
 848
 849struct net_device *dev_get_by_index(struct net *net, int ifindex)
 850{
 851        struct net_device *dev;
 852
 853        rcu_read_lock();
 854        dev = dev_get_by_index_rcu(net, ifindex);
 855        if (dev)
 856                dev_hold(dev);
 857        rcu_read_unlock();
 858        return dev;
 859}
 860EXPORT_SYMBOL(dev_get_by_index);
 861
 862/**
 863 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 864 *      @net: network namespace
 865 *      @name: a pointer to the buffer where the name will be stored.
 866 *      @ifindex: the ifindex of the interface to get the name from.
 867 *
 868 *      The use of raw_seqcount_begin() and cond_resched() before
 869 *      retrying is required as we want to give the writers a chance
 870 *      to complete when CONFIG_PREEMPT is not set.
 871 */
 872int netdev_get_name(struct net *net, char *name, int ifindex)
 873{
 874        struct net_device *dev;
 875        unsigned int seq;
 876
 877retry:
 878        seq = raw_seqcount_begin(&devnet_rename_seq);
 879        rcu_read_lock();
 880        dev = dev_get_by_index_rcu(net, ifindex);
 881        if (!dev) {
 882                rcu_read_unlock();
 883                return -ENODEV;
 884        }
 885
 886        strcpy(name, dev->name);
 887        rcu_read_unlock();
 888        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 889                cond_resched();
 890                goto retry;
 891        }
 892
 893        return 0;
 894}
 895
 896/**
 897 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 898 *      @net: the applicable net namespace
 899 *      @type: media type of device
 900 *      @ha: hardware address
 901 *
  902 *      Search for an interface by MAC address. Returns a pointer to the
  903 *      device, or NULL if the device is not found.
  904 *      The caller must hold RCU or RTNL.
  905 *      The returned device has not had its ref count increased,
  906 *      so the caller must be careful about locking.
  907 *
 908 */
 909
 910struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 911                                       const char *ha)
 912{
 913        struct net_device *dev;
 914
 915        for_each_netdev_rcu(net, dev)
 916                if (dev->type == type &&
 917                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 918                        return dev;
 919
 920        return NULL;
 921}
 922EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
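
/*
 * Illustrative sketch, not part of the original source: the caller holds RCU
 * around the lookup and must not use the returned pointer outside the
 * read-side section without taking its own reference. Hypothetical helper.
 */
static __maybe_unused bool example_hwaddr_in_use(struct net *net, const char *ha)
{
        bool in_use;

        rcu_read_lock();
        in_use = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, ha) != NULL;
        rcu_read_unlock();

        return in_use;
}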
 923
 924struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 925{
 926        struct net_device *dev;
 927
 928        ASSERT_RTNL();
 929        for_each_netdev(net, dev)
 930                if (dev->type == type)
 931                        return dev;
 932
 933        return NULL;
 934}
 935EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 936
 937struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 938{
 939        struct net_device *dev, *ret = NULL;
 940
 941        rcu_read_lock();
 942        for_each_netdev_rcu(net, dev)
 943                if (dev->type == type) {
 944                        dev_hold(dev);
 945                        ret = dev;
 946                        break;
 947                }
 948        rcu_read_unlock();
 949        return ret;
 950}
 951EXPORT_SYMBOL(dev_getfirstbyhwtype);
 952
 953/**
 954 *      __dev_get_by_flags - find any device with given flags
 955 *      @net: the applicable net namespace
 956 *      @if_flags: IFF_* values
 957 *      @mask: bitmask of bits in if_flags to check
 958 *
  959 *      Search for any interface with the given flags. Returns a pointer to
  960 *      the device, or NULL if no device is found. Must be called inside
  961 *      rtnl_lock(), and the result's refcount is unchanged.
 962 */
 963
 964struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 965                                      unsigned short mask)
 966{
 967        struct net_device *dev, *ret;
 968
 969        ASSERT_RTNL();
 970
 971        ret = NULL;
 972        for_each_netdev(net, dev) {
 973                if (((dev->flags ^ if_flags) & mask) == 0) {
 974                        ret = dev;
 975                        break;
 976                }
 977        }
 978        return ret;
 979}
 980EXPORT_SYMBOL(__dev_get_by_flags);
 981
 982/**
 983 *      dev_valid_name - check if name is okay for network device
 984 *      @name: name string
 985 *
  986 *      Network device names need to be valid file names
  987 *      to allow sysfs to work.  We also disallow any kind of
 988 *      whitespace.
 989 */
 990bool dev_valid_name(const char *name)
 991{
 992        if (*name == '\0')
 993                return false;
 994        if (strlen(name) >= IFNAMSIZ)
 995                return false;
 996        if (!strcmp(name, ".") || !strcmp(name, ".."))
 997                return false;
 998
 999        while (*name) {
1000                if (*name == '/' || *name == ':' || isspace(*name))
1001                        return false;
1002                name++;
1003        }
1004        return true;
1005}
1006EXPORT_SYMBOL(dev_valid_name);
1007
1008/**
1009 *      __dev_alloc_name - allocate a name for a device
1010 *      @net: network namespace to allocate the device name in
1011 *      @name: name format string
1012 *      @buf:  scratch buffer and result name string
1013 *
 1014 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 1015 *      id. It scans the list of devices to build up a free map, then chooses
 1016 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 1017 *      while allocating the name and adding the device in order to avoid
 1018 *      duplicates.
 1019 *      Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
 1020 *      Returns the number of the unit assigned or a negative errno code.
1021 */
1022
1023static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1024{
1025        int i = 0;
1026        const char *p;
1027        const int max_netdevices = 8*PAGE_SIZE;
1028        unsigned long *inuse;
1029        struct net_device *d;
1030
1031        p = strnchr(name, IFNAMSIZ-1, '%');
1032        if (p) {
1033                /*
1034                 * Verify the string as this thing may have come from
1035                 * the user.  There must be either one "%d" and no other "%"
1036                 * characters.
1037                 */
1038                if (p[1] != 'd' || strchr(p + 2, '%'))
1039                        return -EINVAL;
1040
1041                /* Use one page as a bit array of possible slots */
1042                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1043                if (!inuse)
1044                        return -ENOMEM;
1045
1046                for_each_netdev(net, d) {
1047                        if (!sscanf(d->name, name, &i))
1048                                continue;
1049                        if (i < 0 || i >= max_netdevices)
1050                                continue;
1051
1052                        /*  avoid cases where sscanf is not exact inverse of printf */
1053                        snprintf(buf, IFNAMSIZ, name, i);
1054                        if (!strncmp(buf, d->name, IFNAMSIZ))
1055                                set_bit(i, inuse);
1056                }
1057
1058                i = find_first_zero_bit(inuse, max_netdevices);
1059                free_page((unsigned long) inuse);
1060        }
1061
1062        if (buf != name)
1063                snprintf(buf, IFNAMSIZ, name, i);
1064        if (!__dev_get_by_name(net, buf))
1065                return i;
1066
1067        /* It is possible to run out of possible slots
1068         * when the name is long and there isn't enough space left
1069         * for the digits, or if all bits are used.
1070         */
1071        return -ENFILE;
1072}
1073
1074/**
1075 *      dev_alloc_name - allocate a name for a device
1076 *      @dev: device
1077 *      @name: name format string
1078 *
 1079 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 1080 *      id. It scans the list of devices to build up a free map, then chooses
 1081 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 1082 *      while allocating the name and adding the device in order to avoid
 1083 *      duplicates.
 1084 *      Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
 1085 *      Returns the number of the unit assigned or a negative errno code.
1086 */
1087
1088int dev_alloc_name(struct net_device *dev, const char *name)
1089{
1090        char buf[IFNAMSIZ];
1091        struct net *net;
1092        int ret;
1093
1094        BUG_ON(!dev_net(dev));
1095        net = dev_net(dev);
1096        ret = __dev_alloc_name(net, name, buf);
1097        if (ret >= 0)
1098                strlcpy(dev->name, buf, IFNAMSIZ);
1099        return ret;
1100}
1101EXPORT_SYMBOL(dev_alloc_name);
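
/*
 * Illustrative sketch, not part of the original source: a driver that wants
 * an automatically numbered name passes a single "%d" in the format string;
 * on success the chosen name (e.g. "dummy0") is written into dev->name.
 */
static __maybe_unused int example_pick_name(struct net_device *dev)
{
        return dev_alloc_name(dev, "dummy%d");
}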
1102
1103static int dev_alloc_name_ns(struct net *net,
1104                             struct net_device *dev,
1105                             const char *name)
1106{
1107        char buf[IFNAMSIZ];
1108        int ret;
1109
1110        ret = __dev_alloc_name(net, name, buf);
1111        if (ret >= 0)
1112                strlcpy(dev->name, buf, IFNAMSIZ);
1113        return ret;
1114}
1115
1116static int dev_get_valid_name(struct net *net,
1117                              struct net_device *dev,
1118                              const char *name)
1119{
1120        BUG_ON(!net);
1121
1122        if (!dev_valid_name(name))
1123                return -EINVAL;
1124
1125        if (strchr(name, '%'))
1126                return dev_alloc_name_ns(net, dev, name);
1127        else if (__dev_get_by_name(net, name))
1128                return -EEXIST;
1129        else if (dev->name != name)
1130                strlcpy(dev->name, name, IFNAMSIZ);
1131
1132        return 0;
1133}
1134
1135/**
1136 *      dev_change_name - change name of a device
1137 *      @dev: device
1138 *      @newname: name (or format string) must be at least IFNAMSIZ
1139 *
 1140 *      Change the name of a device. A format string such as "eth%d"
 1141 *      can be passed for wildcarding.
1142 */
1143int dev_change_name(struct net_device *dev, const char *newname)
1144{
1145        unsigned char old_assign_type;
1146        char oldname[IFNAMSIZ];
1147        int err = 0;
1148        int ret;
1149        struct net *net;
1150
1151        ASSERT_RTNL();
1152        BUG_ON(!dev_net(dev));
1153
1154        net = dev_net(dev);
1155        if (dev->flags & IFF_UP)
1156                return -EBUSY;
1157
1158        write_seqcount_begin(&devnet_rename_seq);
1159
1160        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1161                write_seqcount_end(&devnet_rename_seq);
1162                return 0;
1163        }
1164
1165        memcpy(oldname, dev->name, IFNAMSIZ);
1166
1167        err = dev_get_valid_name(net, dev, newname);
1168        if (err < 0) {
1169                write_seqcount_end(&devnet_rename_seq);
1170                return err;
1171        }
1172
1173        if (oldname[0] && !strchr(oldname, '%'))
1174                netdev_info(dev, "renamed from %s\n", oldname);
1175
1176        old_assign_type = dev->name_assign_type;
1177        dev->name_assign_type = NET_NAME_RENAMED;
1178
1179rollback:
1180        ret = device_rename(&dev->dev, dev->name);
1181        if (ret) {
1182                memcpy(dev->name, oldname, IFNAMSIZ);
1183                dev->name_assign_type = old_assign_type;
1184                write_seqcount_end(&devnet_rename_seq);
1185                return ret;
1186        }
1187
1188        write_seqcount_end(&devnet_rename_seq);
1189
1190        netdev_adjacent_rename_links(dev, oldname);
1191
1192        write_lock_bh(&dev_base_lock);
1193        hlist_del_rcu(&dev->name_hlist);
1194        write_unlock_bh(&dev_base_lock);
1195
1196        synchronize_rcu();
1197
1198        write_lock_bh(&dev_base_lock);
1199        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1200        write_unlock_bh(&dev_base_lock);
1201
1202        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1203        ret = notifier_to_errno(ret);
1204
1205        if (ret) {
1206                /* err >= 0 after dev_alloc_name() or stores the first errno */
1207                if (err >= 0) {
1208                        err = ret;
1209                        write_seqcount_begin(&devnet_rename_seq);
1210                        memcpy(dev->name, oldname, IFNAMSIZ);
1211                        memcpy(oldname, newname, IFNAMSIZ);
1212                        dev->name_assign_type = old_assign_type;
1213                        old_assign_type = NET_NAME_RENAMED;
1214                        goto rollback;
1215                } else {
1216                        pr_err("%s: name change rollback failed: %d\n",
1217                               dev->name, ret);
1218                }
1219        }
1220
1221        return err;
1222}
1223
1224/**
1225 *      dev_set_alias - change ifalias of a device
1226 *      @dev: device
1227 *      @alias: name up to IFALIASZ
 1228 *      @len: limit of bytes to copy from @alias
 1229 *
 1230 *      Set the ifalias for a device.
1231 */
1232int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1233{
1234        char *new_ifalias;
1235
1236        ASSERT_RTNL();
1237
1238        if (len >= IFALIASZ)
1239                return -EINVAL;
1240
1241        if (!len) {
1242                kfree(dev->ifalias);
1243                dev->ifalias = NULL;
1244                return 0;
1245        }
1246
1247        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1248        if (!new_ifalias)
1249                return -ENOMEM;
1250        dev->ifalias = new_ifalias;
1251
1252        strlcpy(dev->ifalias, alias, len+1);
1253        return len;
1254}
1255
1256
1257/**
1258 *      netdev_features_change - device changes features
1259 *      @dev: device to cause notification
1260 *
1261 *      Called to indicate a device has changed features.
1262 */
1263void netdev_features_change(struct net_device *dev)
1264{
1265        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1266}
1267EXPORT_SYMBOL(netdev_features_change);
1268
1269/**
1270 *      netdev_state_change - device changes state
1271 *      @dev: device to cause notification
1272 *
1273 *      Called to indicate a device has changed state. This function calls
1274 *      the notifier chains for netdev_chain and sends a NEWLINK message
1275 *      to the routing socket.
1276 */
1277void netdev_state_change(struct net_device *dev)
1278{
1279        if (dev->flags & IFF_UP) {
1280                struct netdev_notifier_change_info change_info;
1281
1282                change_info.flags_changed = 0;
1283                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1284                                              &change_info.info);
1285                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1286        }
1287}
1288EXPORT_SYMBOL(netdev_state_change);
1289
1290/**
1291 *      netdev_notify_peers - notify network peers about existence of @dev
1292 *      @dev: network device
1293 *
1294 * Generate traffic such that interested network peers are aware of
1295 * @dev, such as by generating a gratuitous ARP. This may be used when
1296 * a device wants to inform the rest of the network about some sort of
1297 * reconfiguration such as a failover event or virtual machine
1298 * migration.
1299 */
1300void netdev_notify_peers(struct net_device *dev)
1301{
1302        rtnl_lock();
1303        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1304        rtnl_unlock();
1305}
1306EXPORT_SYMBOL(netdev_notify_peers);
1307
1308static int __dev_open(struct net_device *dev)
1309{
1310        const struct net_device_ops *ops = dev->netdev_ops;
1311        int ret;
1312
1313        ASSERT_RTNL();
1314
1315        if (!netif_device_present(dev))
1316                return -ENODEV;
1317
1318        /* Block netpoll from trying to do any rx path servicing.
1319         * If we don't do this there is a chance ndo_poll_controller
1320         * or ndo_poll may be running while we open the device
1321         */
1322        netpoll_poll_disable(dev);
1323
1324        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325        ret = notifier_to_errno(ret);
1326        if (ret)
1327                return ret;
1328
1329        set_bit(__LINK_STATE_START, &dev->state);
1330
1331        if (ops->ndo_validate_addr)
1332                ret = ops->ndo_validate_addr(dev);
1333
1334        if (!ret && ops->ndo_open)
1335                ret = ops->ndo_open(dev);
1336
1337        netpoll_poll_enable(dev);
1338
1339        if (ret)
1340                clear_bit(__LINK_STATE_START, &dev->state);
1341        else {
1342                dev->flags |= IFF_UP;
1343                dev_set_rx_mode(dev);
1344                dev_activate(dev);
1345                add_device_randomness(dev->dev_addr, dev->addr_len);
1346        }
1347
1348        return ret;
1349}
1350
1351/**
1352 *      dev_open        - prepare an interface for use.
1353 *      @dev:   device to open
1354 *
1355 *      Takes a device from down to up state. The device's private open
1356 *      function is invoked and then the multicast lists are loaded. Finally
1357 *      the device is moved into the up state and a %NETDEV_UP message is
1358 *      sent to the netdev notifier chain.
1359 *
1360 *      Calling this function on an active interface is a nop. On a failure
1361 *      a negative errno code is returned.
1362 */
1363int dev_open(struct net_device *dev)
1364{
1365        int ret;
1366
1367        if (dev->flags & IFF_UP)
1368                return 0;
1369
1370        ret = __dev_open(dev);
1371        if (ret < 0)
1372                return ret;
1373
1374        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1375        call_netdevice_notifiers(NETDEV_UP, dev);
1376
1377        return ret;
1378}
1379EXPORT_SYMBOL(dev_open);
1380
1381static int __dev_close_many(struct list_head *head)
1382{
1383        struct net_device *dev;
1384
1385        ASSERT_RTNL();
1386        might_sleep();
1387
1388        list_for_each_entry(dev, head, close_list) {
1389                /* Temporarily disable netpoll until the interface is down */
1390                netpoll_poll_disable(dev);
1391
1392                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1393
1394                clear_bit(__LINK_STATE_START, &dev->state);
1395
 1396                /* Synchronize with the scheduled poll. We cannot touch the poll
 1397                 * list; it may even be on a different CPU. So just clear netif_running().
 1398                 *
 1399                 * dev->stop() will invoke napi_disable() on all of its
 1400                 * napi_struct instances on this device.
1401                 */
1402                smp_mb__after_atomic(); /* Commit netif_running(). */
1403        }
1404
1405        dev_deactivate_many(head);
1406
1407        list_for_each_entry(dev, head, close_list) {
1408                const struct net_device_ops *ops = dev->netdev_ops;
1409
1410                /*
 1411                 *      Call the device-specific close. This cannot fail and is
 1412                 *      only done if the device is UP.
1413                 *
1414                 *      We allow it to be called even after a DETACH hot-plug
1415                 *      event.
1416                 */
1417                if (ops->ndo_stop)
1418                        ops->ndo_stop(dev);
1419
1420                dev->flags &= ~IFF_UP;
1421                netpoll_poll_enable(dev);
1422        }
1423
1424        return 0;
1425}
1426
1427static int __dev_close(struct net_device *dev)
1428{
1429        int retval;
1430        LIST_HEAD(single);
1431
1432        list_add(&dev->close_list, &single);
1433        retval = __dev_close_many(&single);
1434        list_del(&single);
1435
1436        return retval;
1437}
1438
1439int dev_close_many(struct list_head *head, bool unlink)
1440{
1441        struct net_device *dev, *tmp;
1442
1443        /* Remove the devices that don't need to be closed */
1444        list_for_each_entry_safe(dev, tmp, head, close_list)
1445                if (!(dev->flags & IFF_UP))
1446                        list_del_init(&dev->close_list);
1447
1448        __dev_close_many(head);
1449
1450        list_for_each_entry_safe(dev, tmp, head, close_list) {
1451                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1452                call_netdevice_notifiers(NETDEV_DOWN, dev);
1453                if (unlink)
1454                        list_del_init(&dev->close_list);
1455        }
1456
1457        return 0;
1458}
1459EXPORT_SYMBOL(dev_close_many);
1460
1461/**
1462 *      dev_close - shutdown an interface.
1463 *      @dev: device to shutdown
1464 *
1465 *      This function moves an active device into down state. A
1466 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468 *      chain.
1469 */
1470int dev_close(struct net_device *dev)
1471{
1472        if (dev->flags & IFF_UP) {
1473                LIST_HEAD(single);
1474
1475                list_add(&dev->close_list, &single);
1476                dev_close_many(&single, true);
1477                list_del(&single);
1478        }
1479        return 0;
1480}
1481EXPORT_SYMBOL(dev_close);
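
/*
 * Illustrative sketch, not part of the original source: dev_open() and
 * dev_close() both expect the caller to hold the rtnl semaphore.
 * Hypothetical helper that briefly bounces an interface.
 */
static __maybe_unused int example_bounce_interface(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_open(dev);
        if (!err)
                dev_close(dev);
        rtnl_unlock();

        return err;
}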
1482
1483
1484/**
1485 *      dev_disable_lro - disable Large Receive Offload on a device
1486 *      @dev: device
1487 *
1488 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1489 *      called under RTNL.  This is needed if received packets may be
1490 *      forwarded to another interface.
1491 */
1492void dev_disable_lro(struct net_device *dev)
1493{
1494        struct net_device *lower_dev;
1495        struct list_head *iter;
1496
1497        dev->wanted_features &= ~NETIF_F_LRO;
1498        netdev_update_features(dev);
1499
1500        if (unlikely(dev->features & NETIF_F_LRO))
1501                netdev_WARN(dev, "failed to disable LRO!\n");
1502
1503        netdev_for_each_lower_dev(dev, lower_dev, iter)
1504                dev_disable_lro(lower_dev);
1505}
1506EXPORT_SYMBOL(dev_disable_lro);
1507
1508static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1509                                   struct net_device *dev)
1510{
1511        struct netdev_notifier_info info;
1512
1513        netdev_notifier_info_init(&info, dev);
1514        return nb->notifier_call(nb, val, &info);
1515}
1516
1517static int dev_boot_phase = 1;
1518
1519/**
1520 *      register_netdevice_notifier - register a network notifier block
1521 *      @nb: notifier
1522 *
1523 *      Register a notifier to be called when network device events occur.
1524 *      The notifier passed is linked into the kernel structures and must
1525 *      not be reused until it has been unregistered. A negative errno code
1526 *      is returned on a failure.
1527 *
 1528 *      When registered, all registration and up events are replayed
 1529 *      to the new notifier to allow it to obtain a race-free
 1530 *      view of the network device list.
1531 */
1532
1533int register_netdevice_notifier(struct notifier_block *nb)
1534{
1535        struct net_device *dev;
1536        struct net_device *last;
1537        struct net *net;
1538        int err;
1539
1540        rtnl_lock();
1541        err = raw_notifier_chain_register(&netdev_chain, nb);
1542        if (err)
1543                goto unlock;
1544        if (dev_boot_phase)
1545                goto unlock;
1546        for_each_net(net) {
1547                for_each_netdev(net, dev) {
1548                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1549                        err = notifier_to_errno(err);
1550                        if (err)
1551                                goto rollback;
1552
1553                        if (!(dev->flags & IFF_UP))
1554                                continue;
1555
1556                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1557                }
1558        }
1559
1560unlock:
1561        rtnl_unlock();
1562        return err;
1563
1564rollback:
1565        last = dev;
1566        for_each_net(net) {
1567                for_each_netdev(net, dev) {
1568                        if (dev == last)
1569                                goto outroll;
1570
1571                        if (dev->flags & IFF_UP) {
1572                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1573                                                        dev);
1574                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1575                        }
1576                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1577                }
1578        }
1579
1580outroll:
1581        raw_notifier_chain_unregister(&netdev_chain, nb);
1582        goto unlock;
1583}
1584EXPORT_SYMBOL(register_netdevice_notifier);
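/*
 * Example sketch (not part of dev.c): a module-local notifier block.  The
 * example_* names are hypothetical; netdev_notifier_info_to_dev() recovers
 * the net_device from the info structure passed down the chain above.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_UP:
                pr_info("%s is up\n", dev->name);
                break;
        case NETDEV_UNREGISTER:
                pr_info("%s is going away\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
        .notifier_call = example_netdev_event,
};

/*
 * A module would call register_netdevice_notifier(&example_netdev_nb) from
 * its init path and unregister_netdevice_notifier(&example_netdev_nb) from
 * its exit path.
 */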
1585
1586/**
1587 *      unregister_netdevice_notifier - unregister a network notifier block
1588 *      @nb: notifier
1589 *
1590 *      Unregister a notifier previously registered by
1591 *      register_netdevice_notifier(). The notifier is unlinked from the
1592 *      kernel structures and may then be reused. A negative errno code
1593 *      is returned on a failure.
1594 *
1595 *      After unregistering, unregister and down device events are synthesized
1596 *      for all devices on the device list and sent to the removed notifier,
1597 *      removing the need for special-case cleanup code.
1598 */
1599
1600int unregister_netdevice_notifier(struct notifier_block *nb)
1601{
1602        struct net_device *dev;
1603        struct net *net;
1604        int err;
1605
1606        rtnl_lock();
1607        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1608        if (err)
1609                goto unlock;
1610
1611        for_each_net(net) {
1612                for_each_netdev(net, dev) {
1613                        if (dev->flags & IFF_UP) {
1614                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615                                                        dev);
1616                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1617                        }
1618                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1619                }
1620        }
1621unlock:
1622        rtnl_unlock();
1623        return err;
1624}
1625EXPORT_SYMBOL(unregister_netdevice_notifier);
1626
1627/**
1628 *      call_netdevice_notifiers_info - call all network notifier blocks
1629 *      @val: value passed unmodified to notifier function
1630 *      @dev: net_device pointer passed unmodified to notifier function
1631 *      @info: notifier information data
1632 *
1633 *      Call all network notifier blocks.  Parameters and return value
1634 *      are as for raw_notifier_call_chain().
1635 */
1636
1637static int call_netdevice_notifiers_info(unsigned long val,
1638                                         struct net_device *dev,
1639                                         struct netdev_notifier_info *info)
1640{
1641        ASSERT_RTNL();
1642        netdev_notifier_info_init(info, dev);
1643        return raw_notifier_call_chain(&netdev_chain, val, info);
1644}
1645
1646/**
1647 *      call_netdevice_notifiers - call all network notifier blocks
1648 *      @val: value passed unmodified to notifier function
1649 *      @dev: net_device pointer passed unmodified to notifier function
1650 *
1651 *      Call all network notifier blocks.  Parameters and return value
1652 *      are as for raw_notifier_call_chain().
1653 */
1654
1655int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1656{
1657        struct netdev_notifier_info info;
1658
1659        return call_netdevice_notifiers_info(val, dev, &info);
1660}
1661EXPORT_SYMBOL(call_netdevice_notifiers);
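/*
 * Example sketch (not part of dev.c): announcing a device state change to
 * every registered notifier.  Callers hold RTNL, which the _info variant
 * above asserts; the helper name is hypothetical.
 */
static void example_announce_change(struct net_device *dev)
{
        ASSERT_RTNL();
        call_netdevice_notifiers(NETDEV_CHANGE, dev);
}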
1662
1663#ifdef CONFIG_NET_INGRESS
1664static struct static_key ingress_needed __read_mostly;
1665
1666void net_inc_ingress_queue(void)
1667{
1668        static_key_slow_inc(&ingress_needed);
1669}
1670EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1671
1672void net_dec_ingress_queue(void)
1673{
1674        static_key_slow_dec(&ingress_needed);
1675}
1676EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1677#endif
1678
1679#ifdef CONFIG_NET_EGRESS
1680static struct static_key egress_needed __read_mostly;
1681
1682void net_inc_egress_queue(void)
1683{
1684        static_key_slow_inc(&egress_needed);
1685}
1686EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1687
1688void net_dec_egress_queue(void)
1689{
1690        static_key_slow_dec(&egress_needed);
1691}
1692EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1693#endif
1694
1695static struct static_key netstamp_needed __read_mostly;
1696#ifdef HAVE_JUMP_LABEL
1697/* We are not allowed to call static_key_slow_dec() from irq context.
1698 * If net_disable_timestamp() is called from irq context, defer the
1699 * static_key_slow_dec() calls.
1700 */
1701static atomic_t netstamp_needed_deferred;
1702#endif
1703
1704void net_enable_timestamp(void)
1705{
1706#ifdef HAVE_JUMP_LABEL
1707        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1708
1709        if (deferred) {
1710                while (--deferred)
1711                        static_key_slow_dec(&netstamp_needed);
1712                return;
1713        }
1714#endif
1715        static_key_slow_inc(&netstamp_needed);
1716}
1717EXPORT_SYMBOL(net_enable_timestamp);
1718
1719void net_disable_timestamp(void)
1720{
1721#ifdef HAVE_JUMP_LABEL
1722        if (in_interrupt()) {
1723                atomic_inc(&netstamp_needed_deferred);
1724                return;
1725        }
1726#endif
1727        static_key_slow_dec(&netstamp_needed);
1728}
1729EXPORT_SYMBOL(net_disable_timestamp);
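/*
 * Example sketch (not part of dev.c): a hypothetical consumer of packet
 * timestamps keeps the static key balanced over its lifetime by pairing
 * the enable and disable calls above.
 */
static void example_feature_start(void)
{
        net_enable_timestamp();
}

static void example_feature_stop(void)
{
        net_disable_timestamp();
}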
1730
1731static inline void net_timestamp_set(struct sk_buff *skb)
1732{
1733        skb->tstamp.tv64 = 0;
1734        if (static_key_false(&netstamp_needed))
1735                __net_timestamp(skb);
1736}
1737
1738#define net_timestamp_check(COND, SKB)                  \
1739        if (static_key_false(&netstamp_needed)) {               \
1740                if ((COND) && !(SKB)->tstamp.tv64)      \
1741                        __net_timestamp(SKB);           \
1742        }                                               \
1743
1744bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1745{
1746        unsigned int len;
1747
1748        if (!(dev->flags & IFF_UP))
1749                return false;
1750
1751        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1752        if (skb->len <= len)
1753                return true;
1754
1755        /* if TSO is enabled, we don't care about the length, as the packet
1756         * may be forwarded without being segmented first
1757         */
1758        if (skb_is_gso(skb))
1759                return true;
1760
1761        return false;
1762}
1763EXPORT_SYMBOL_GPL(is_skb_forwardable);
1764
1765int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1766{
1767        if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1768            unlikely(!is_skb_forwardable(dev, skb))) {
1769                atomic_long_inc(&dev->rx_dropped);
1770                kfree_skb(skb);
1771                return NET_RX_DROP;
1772        }
1773
1774        skb_scrub_packet(skb, true);
1775        skb->priority = 0;
1776        skb->protocol = eth_type_trans(skb, dev);
1777        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1778
1779        return 0;
1780}
1781EXPORT_SYMBOL_GPL(__dev_forward_skb);
1782
1783/**
1784 * dev_forward_skb - loopback an skb to another netif
1785 *
1786 * @dev: destination network device
1787 * @skb: buffer to forward
1788 *
1789 * return values:
1790 *      NET_RX_SUCCESS  (no congestion)
1791 *      NET_RX_DROP     (packet was dropped, but freed)
1792 *
1793 * dev_forward_skb can be used for injecting an skb from the
1794 * start_xmit function of one device into the receive queue
1795 * of another device.
1796 *
1797 * The receiving device may be in another namespace, so
1798 * we have to clear all information in the skb that could
1799 * impact namespace isolation.
1800 */
1801int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1802{
1803        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1804}
1805EXPORT_SYMBOL_GPL(dev_forward_skb);
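/*
 * Example sketch (not part of dev.c): a veth-style ndo_start_xmit that
 * injects frames into a peer device.  struct example_priv and its peer
 * pointer are hypothetical driver bookkeeping.
 */
struct example_priv {
        struct net_device __rcu *peer;
};

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct example_priv *priv = netdev_priv(dev);
        struct net_device *peer;

        rcu_read_lock();
        peer = rcu_dereference(priv->peer);
        if (likely(peer)) {
                /* dev_forward_skb() frees the skb itself on failure */
                if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
                        dev->stats.tx_dropped++;
        } else {
                kfree_skb(skb);
                dev->stats.tx_dropped++;
        }
        rcu_read_unlock();
        return NETDEV_TX_OK;
}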
1806
1807static inline int deliver_skb(struct sk_buff *skb,
1808                              struct packet_type *pt_prev,
1809                              struct net_device *orig_dev)
1810{
1811        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1812                return -ENOMEM;
1813        atomic_inc(&skb->users);
1814        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1815}
1816
1817static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1818                                          struct packet_type **pt,
1819                                          struct net_device *orig_dev,
1820                                          __be16 type,
1821                                          struct list_head *ptype_list)
1822{
1823        struct packet_type *ptype, *pt_prev = *pt;
1824
1825        list_for_each_entry_rcu(ptype, ptype_list, list) {
1826                if (ptype->type != type)
1827                        continue;
1828                if (pt_prev)
1829                        deliver_skb(skb, pt_prev, orig_dev);
1830                pt_prev = ptype;
1831        }
1832        *pt = pt_prev;
1833}
1834
1835static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1836{
1837        if (!ptype->af_packet_priv || !skb->sk)
1838                return false;
1839
1840        if (ptype->id_match)
1841                return ptype->id_match(ptype, skb->sk);
1842        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1843                return true;
1844
1845        return false;
1846}
1847
1848/*
1849 *      Support routine. Sends outgoing frames to any network
1850 *      taps currently in use.
1851 */
1852
1853void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1854{
1855        struct packet_type *ptype;
1856        struct sk_buff *skb2 = NULL;
1857        struct packet_type *pt_prev = NULL;
1858        struct list_head *ptype_list = &ptype_all;
1859
1860        rcu_read_lock();
1861again:
1862        list_for_each_entry_rcu(ptype, ptype_list, list) {
1863                /* Never send packets back to the socket
1864                 * they originated from - MvS (miquels@drinkel.ow.org)
1865                 */
1866                if (skb_loop_sk(ptype, skb))
1867                        continue;
1868
1869                if (pt_prev) {
1870                        deliver_skb(skb2, pt_prev, skb->dev);
1871                        pt_prev = ptype;
1872                        continue;
1873                }
1874
1875                /* need to clone skb, done only once */
1876                skb2 = skb_clone(skb, GFP_ATOMIC);
1877                if (!skb2)
1878                        goto out_unlock;
1879
1880                net_timestamp_set(skb2);
1881
1882                /* The network header should already be
1883                 * set correctly by the sender, so the check below is
1884                 * just protection against buggy protocols.
1885                 */
1886                skb_reset_mac_header(skb2);
1887
1888                if (skb_network_header(skb2) < skb2->data ||
1889                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1890                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1891                                             ntohs(skb2->protocol),
1892                                             dev->name);
1893                        skb_reset_network_header(skb2);
1894                }
1895
1896                skb2->transport_header = skb2->network_header;
1897                skb2->pkt_type = PACKET_OUTGOING;
1898                pt_prev = ptype;
1899        }
1900
1901        if (ptype_list == &ptype_all) {
1902                ptype_list = &dev->ptype_all;
1903                goto again;
1904        }
1905out_unlock:
1906        if (pt_prev)
1907                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1908        rcu_read_unlock();
1909}
1910EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1911
1912/**
1913 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1914 * @dev: Network device
1915 * @txq: number of queues available
1916 *
1917 * If real_num_tx_queues is changed the tc mappings may no longer be
1918 * valid. To resolve this verify the tc mapping remains valid and if
1919 * not, reset the affected prio-to-tc mapping to TC0. With no priorities
1920 * mapping to an offset/count pair, that pair will no longer be used. In
1921 * the worst case, if TC0 itself is invalid, nothing can be done, so all
1922 * priority mappings are disabled. It is expected that drivers will fix
1923 * this mapping if they can before calling netif_set_real_num_tx_queues.
1924 */
1925static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1926{
1927        int i;
1928        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1929
1930        /* If TC0 is invalidated disable TC mapping */
1931        if (tc->offset + tc->count > txq) {
1932                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1933                dev->num_tc = 0;
1934                return;
1935        }
1936
1937        /* Invalidated prio to tc mappings set to TC0 */
1938        for (i = 1; i < TC_BITMASK + 1; i++) {
1939                int q = netdev_get_prio_tc_map(dev, i);
1940
1941                tc = &dev->tc_to_txq[q];
1942                if (tc->offset + tc->count > txq) {
1943                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1944                                i, q);
1945                        netdev_set_prio_tc_map(dev, i, 0);
1946                }
1947        }
1948}
1949
1950#ifdef CONFIG_XPS
1951static DEFINE_MUTEX(xps_map_mutex);
1952#define xmap_dereference(P)             \
1953        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1954
1955static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1956                                        int cpu, u16 index)
1957{
1958        struct xps_map *map = NULL;
1959        int pos;
1960
1961        if (dev_maps)
1962                map = xmap_dereference(dev_maps->cpu_map[cpu]);
1963
1964        for (pos = 0; map && pos < map->len; pos++) {
1965                if (map->queues[pos] == index) {
1966                        if (map->len > 1) {
1967                                map->queues[pos] = map->queues[--map->len];
1968                        } else {
1969                                RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1970                                kfree_rcu(map, rcu);
1971                                map = NULL;
1972                        }
1973                        break;
1974                }
1975        }
1976
1977        return map;
1978}
1979
1980static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1981{
1982        struct xps_dev_maps *dev_maps;
1983        int cpu, i;
1984        bool active = false;
1985
1986        mutex_lock(&xps_map_mutex);
1987        dev_maps = xmap_dereference(dev->xps_maps);
1988
1989        if (!dev_maps)
1990                goto out_no_maps;
1991
1992        for_each_possible_cpu(cpu) {
1993                for (i = index; i < dev->num_tx_queues; i++) {
1994                        if (!remove_xps_queue(dev_maps, cpu, i))
1995                                break;
1996                }
1997                if (i == dev->num_tx_queues)
1998                        active = true;
1999        }
2000
2001        if (!active) {
2002                RCU_INIT_POINTER(dev->xps_maps, NULL);
2003                kfree_rcu(dev_maps, rcu);
2004        }
2005
2006        for (i = index; i < dev->num_tx_queues; i++)
2007                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2008                                             NUMA_NO_NODE);
2009
2010out_no_maps:
2011        mutex_unlock(&xps_map_mutex);
2012}
2013
2014static struct xps_map *expand_xps_map(struct xps_map *map,
2015                                      int cpu, u16 index)
2016{
2017        struct xps_map *new_map;
2018        int alloc_len = XPS_MIN_MAP_ALLOC;
2019        int i, pos;
2020
2021        for (pos = 0; map && pos < map->len; pos++) {
2022                if (map->queues[pos] != index)
2023                        continue;
2024                return map;
2025        }
2026
2027        /* Need to add queue to this CPU's existing map */
2028        if (map) {
2029                if (pos < map->alloc_len)
2030                        return map;
2031
2032                alloc_len = map->alloc_len * 2;
2033        }
2034
2035        /* Need to allocate a new map to store the queue for this CPU */
2036        new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2037                               cpu_to_node(cpu));
2038        if (!new_map)
2039                return NULL;
2040
2041        for (i = 0; i < pos; i++)
2042                new_map->queues[i] = map->queues[i];
2043        new_map->alloc_len = alloc_len;
2044        new_map->len = pos;
2045
2046        return new_map;
2047}
2048
2049int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2050                        u16 index)
2051{
2052        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2053        struct xps_map *map, *new_map;
2054        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2055        int cpu, numa_node_id = -2;
2056        bool active = false;
2057
2058        mutex_lock(&xps_map_mutex);
2059
2060        dev_maps = xmap_dereference(dev->xps_maps);
2061
2062        /* allocate memory for queue storage */
2063        for_each_online_cpu(cpu) {
2064                if (!cpumask_test_cpu(cpu, mask))
2065                        continue;
2066
2067                if (!new_dev_maps)
2068                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2069                if (!new_dev_maps) {
2070                        mutex_unlock(&xps_map_mutex);
2071                        return -ENOMEM;
2072                }
2073
2074                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2075                                 NULL;
2076
2077                map = expand_xps_map(map, cpu, index);
2078                if (!map)
2079                        goto error;
2080
2081                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2082        }
2083
2084        if (!new_dev_maps)
2085                goto out_no_new_maps;
2086
2087        for_each_possible_cpu(cpu) {
2088                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2089                        /* add queue to CPU maps */
2090                        int pos = 0;
2091
2092                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2093                        while ((pos < map->len) && (map->queues[pos] != index))
2094                                pos++;
2095
2096                        if (pos == map->len)
2097                                map->queues[map->len++] = index;
2098#ifdef CONFIG_NUMA
2099                        if (numa_node_id == -2)
2100                                numa_node_id = cpu_to_node(cpu);
2101                        else if (numa_node_id != cpu_to_node(cpu))
2102                                numa_node_id = -1;
2103#endif
2104                } else if (dev_maps) {
2105                        /* fill in the new device map from the old device map */
2106                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2107                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2108                }
2109
2110        }
2111
2112        rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2113
2114        /* Cleanup old maps */
2115        if (dev_maps) {
2116                for_each_possible_cpu(cpu) {
2117                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2118                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2119                        if (map && map != new_map)
2120                                kfree_rcu(map, rcu);
2121                }
2122
2123                kfree_rcu(dev_maps, rcu);
2124        }
2125
2126        dev_maps = new_dev_maps;
2127        active = true;
2128
2129out_no_new_maps:
2130        /* update Tx queue numa node */
2131        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2132                                     (numa_node_id >= 0) ? numa_node_id :
2133                                     NUMA_NO_NODE);
2134
2135        if (!dev_maps)
2136                goto out_no_maps;
2137
2138        /* removes queue from unused CPUs */
2139        for_each_possible_cpu(cpu) {
2140                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2141                        continue;
2142
2143                if (remove_xps_queue(dev_maps, cpu, index))
2144                        active = true;
2145        }
2146
2147        /* free map if not active */
2148        if (!active) {
2149                RCU_INIT_POINTER(dev->xps_maps, NULL);
2150                kfree_rcu(dev_maps, rcu);
2151        }
2152
2153out_no_maps:
2154        mutex_unlock(&xps_map_mutex);
2155
2156        return 0;
2157error:
2158        /* remove any maps that we added */
2159        for_each_possible_cpu(cpu) {
2160                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2161                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2162                                 NULL;
2163                if (new_map && new_map != map)
2164                        kfree(new_map);
2165        }
2166
2167        mutex_unlock(&xps_map_mutex);
2168
2169        kfree(new_dev_maps);
2170        return -ENOMEM;
2171}
2172EXPORT_SYMBOL(netif_set_xps_queue);
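/*
 * Example sketch (not part of dev.c): a multiqueue driver spreading its TX
 * queues across online CPUs once the queue count has been settled.  The
 * helper name is hypothetical.
 */
static void example_setup_xps(struct net_device *dev)
{
        int i;

        for (i = 0; i < dev->real_num_tx_queues; i++)
                netif_set_xps_queue(dev, cpumask_of(i % num_online_cpus()), i);
}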
2173
2174#endif
2175/*
2176 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2177 * greater than real_num_tx_queues, stale skbs on those qdiscs must be flushed.
2178 */
2179int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2180{
2181        int rc;
2182
2183        if (txq < 1 || txq > dev->num_tx_queues)
2184                return -EINVAL;
2185
2186        if (dev->reg_state == NETREG_REGISTERED ||
2187            dev->reg_state == NETREG_UNREGISTERING) {
2188                ASSERT_RTNL();
2189
2190                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2191                                                  txq);
2192                if (rc)
2193                        return rc;
2194
2195                if (dev->num_tc)
2196                        netif_setup_tc(dev, txq);
2197
2198                if (txq < dev->real_num_tx_queues) {
2199                        qdisc_reset_all_tx_gt(dev, txq);
2200#ifdef CONFIG_XPS
2201                        netif_reset_xps_queues_gt(dev, txq);
2202#endif
2203                }
2204        }
2205
2206        dev->real_num_tx_queues = txq;
2207        return 0;
2208}
2209EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2210
2211#ifdef CONFIG_SYSFS
2212/**
2213 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2214 *      @dev: Network device
2215 *      @rxq: Actual number of RX queues
2216 *
2217 *      This must be called either with the rtnl_lock held or before
2218 *      registration of the net device.  Returns 0 on success, or a
2219 *      negative error code.  If called before registration, it always
2220 *      succeeds.
2221 */
2222int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2223{
2224        int rc;
2225
2226        if (rxq < 1 || rxq > dev->num_rx_queues)
2227                return -EINVAL;
2228
2229        if (dev->reg_state == NETREG_REGISTERED) {
2230                ASSERT_RTNL();
2231
2232                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2233                                                  rxq);
2234                if (rc)
2235                        return rc;
2236        }
2237
2238        dev->real_num_rx_queues = rxq;
2239        return 0;
2240}
2241EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2242#endif
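/*
 * Example sketch (not part of dev.c): how an ethtool-style reconfiguration
 * path might resize both queue counts.  "count" is hypothetical, and the
 * rtnl lock is already held on that path, as both helpers require after
 * registration.
 */
static int example_set_channels(struct net_device *dev, unsigned int count)
{
        int err;

        err = netif_set_real_num_tx_queues(dev, count);
        if (err)
                return err;

        return netif_set_real_num_rx_queues(dev, count);
}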
2243
2244/**
2245 * netif_get_num_default_rss_queues - default number of RSS queues
2246 *
2247 * This routine should set an upper limit on the number of RSS queues
2248 * used by default by multiqueue devices.
2249 */
2250int netif_get_num_default_rss_queues(void)
2251{
2252        return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2253}
2254EXPORT_SYMBOL(netif_get_num_default_rss_queues);
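/*
 * Example sketch (not part of dev.c): a driver clamping its RSS queue count
 * to the default upper bound; EXAMPLE_HW_MAX_QUEUES is hypothetical.
 */
#define EXAMPLE_HW_MAX_QUEUES   64

static unsigned int example_num_rss_queues(void)
{
        return min_t(unsigned int, EXAMPLE_HW_MAX_QUEUES,
                     netif_get_num_default_rss_queues());
}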
2255
2256static inline void __netif_reschedule(struct Qdisc *q)
2257{
2258        struct softnet_data *sd;
2259        unsigned long flags;
2260
2261        local_irq_save(flags);
2262        sd = this_cpu_ptr(&softnet_data);
2263        q->next_sched = NULL;
2264        *sd->output_queue_tailp = q;
2265        sd->output_queue_tailp = &q->next_sched;
2266        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2267        local_irq_restore(flags);
2268}
2269
2270void __netif_schedule(struct Qdisc *q)
2271{
2272        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2273                __netif_reschedule(q);
2274}
2275EXPORT_SYMBOL(__netif_schedule);
2276
2277struct dev_kfree_skb_cb {
2278        enum skb_free_reason reason;
2279};
2280
2281static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2282{
2283        return (struct dev_kfree_skb_cb *)skb->cb;
2284}
2285
2286void netif_schedule_queue(struct netdev_queue *txq)
2287{
2288        rcu_read_lock();
2289        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2290                struct Qdisc *q = rcu_dereference(txq->qdisc);
2291
2292                __netif_schedule(q);
2293        }
2294        rcu_read_unlock();
2295}
2296EXPORT_SYMBOL(netif_schedule_queue);
2297
2298/**
2299 *      netif_wake_subqueue - allow sending packets on subqueue
2300 *      @dev: network device
2301 *      @queue_index: sub queue index
2302 *
2303 * Resume an individual transmit queue of a device with multiple transmit queues.
2304 */
2305void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2306{
2307        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2308
2309        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2310                struct Qdisc *q;
2311
2312                rcu_read_lock();
2313                q = rcu_dereference(txq->qdisc);
2314                __netif_schedule(q);
2315                rcu_read_unlock();
2316        }
2317}
2318EXPORT_SYMBOL(netif_wake_subqueue);
2319
2320void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2321{
2322        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2323                struct Qdisc *q;
2324
2325                rcu_read_lock();
2326                q = rcu_dereference(dev_queue->qdisc);
2327                __netif_schedule(q);
2328                rcu_read_unlock();
2329        }
2330}
2331EXPORT_SYMBOL(netif_tx_wake_queue);
2332
2333void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2334{
2335        unsigned long flags;
2336
2337        if (likely(atomic_read(&skb->users) == 1)) {
2338                smp_rmb();
2339                atomic_set(&skb->users, 0);
2340        } else if (likely(!atomic_dec_and_test(&skb->users))) {
2341                return;
2342        }
2343        get_kfree_skb_cb(skb)->reason = reason;
2344        local_irq_save(flags);
2345        skb->next = __this_cpu_read(softnet_data.completion_queue);
2346        __this_cpu_write(softnet_data.completion_queue, skb);
2347        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2348        local_irq_restore(flags);
2349}
2350EXPORT_SYMBOL(__dev_kfree_skb_irq);
2351
2352void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2353{
2354        if (in_irq() || irqs_disabled())
2355                __dev_kfree_skb_irq(skb, reason);
2356        else
2357                dev_kfree_skb(skb);
2358}
2359EXPORT_SYMBOL(__dev_kfree_skb_any);
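/*
 * Example sketch (not part of dev.c): a TX completion handler that may run
 * in hard-irq context frees skbs through the dev_*_skb_any() wrappers,
 * which funnel into __dev_kfree_skb_irq() above when needed.  The ring
 * structure is hypothetical.
 */
struct example_ring {
        struct sk_buff  *done[16];
        unsigned int     head, tail;
};

static void example_tx_clean(struct example_ring *ring)
{
        while (ring->head != ring->tail) {
                struct sk_buff *skb = ring->done[ring->head++ % 16];

                dev_consume_skb_any(skb);       /* frame was transmitted */
        }
}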
2360
2361
2362/**
2363 * netif_device_detach - mark device as removed
2364 * @dev: network device
2365 *
2366 * Mark device as removed from system and therefore no longer available.
2367 */
2368void netif_device_detach(struct net_device *dev)
2369{
2370        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2371            netif_running(dev)) {
2372                netif_tx_stop_all_queues(dev);
2373        }
2374}
2375EXPORT_SYMBOL(netif_device_detach);
2376
2377/**
2378 * netif_device_attach - mark device as attached
2379 * @dev: network device
2380 *
2381 * Mark device as attached to the system and restart it if needed.
2382 */
2383void netif_device_attach(struct net_device *dev)
2384{
2385        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2386            netif_running(dev)) {
2387                netif_tx_wake_all_queues(dev);
2388                __netdev_watchdog_up(dev);
2389        }
2390}
2391EXPORT_SYMBOL(netif_device_attach);
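/*
 * Example sketch (not part of dev.c): the usual suspend/resume pairing in
 * a driver's power-management hooks; the example_* names are hypothetical.
 */
static int example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);       /* stops TX queues if the device is up */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        netif_device_attach(dev);       /* wakes queues and rearms the watchdog */
        return 0;
}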
2392
2393/*
2394 * Returns a Tx hash based on the given packet descriptor and a Tx queue
2395 * count to be used as a distribution range.
2396 */
2397u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2398                  unsigned int num_tx_queues)
2399{
2400        u32 hash;
2401        u16 qoffset = 0;
2402        u16 qcount = num_tx_queues;
2403
2404        if (skb_rx_queue_recorded(skb)) {
2405                hash = skb_get_rx_queue(skb);
2406                while (unlikely(hash >= num_tx_queues))
2407                        hash -= num_tx_queues;
2408                return hash;
2409        }
2410
2411        if (dev->num_tc) {
2412                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2413                qoffset = dev->tc_to_txq[tc].offset;
2414                qcount = dev->tc_to_txq[tc].count;
2415        }
2416
2417        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2418}
2419EXPORT_SYMBOL(__skb_tx_hash);
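/*
 * Example sketch (not part of dev.c): a driver queue-selection helper that
 * simply spreads flows over the device's real TX queues via skb_tx_hash(),
 * the inline wrapper around __skb_tx_hash() in <linux/netdevice.h>.
 */
static u16 example_pick_txq(struct net_device *dev, struct sk_buff *skb)
{
        return skb_tx_hash(dev, skb);
}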
2420
2421static void skb_warn_bad_offload(const struct sk_buff *skb)
2422{
2423        static const netdev_features_t null_features = 0;
2424        struct net_device *dev = skb->dev;
2425        const char *name = "";
2426
2427        if (!net_ratelimit())
2428                return;
2429
2430        if (dev) {
2431                if (dev->dev.parent)
2432                        name = dev_driver_string(dev->dev.parent);
2433                else
2434                        name = netdev_name(dev);
2435        }
2436        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2437             "gso_type=%d ip_summed=%d\n",
2438             name, dev ? &dev->features : &null_features,
2439             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2440             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2441             skb_shinfo(skb)->gso_type, skb->ip_summed);
2442}
2443
2444/*
2445 * Invalidate hardware checksum when packet is to be mangled, and
2446 * complete checksum manually on outgoing path.
2447 */
2448int skb_checksum_help(struct sk_buff *skb)
2449{
2450        __wsum csum;
2451        int ret = 0, offset;
2452
2453        if (skb->ip_summed == CHECKSUM_COMPLETE)
2454                goto out_set_summed;
2455
2456        if (unlikely(skb_shinfo(skb)->gso_size)) {
2457                skb_warn_bad_offload(skb);
2458                return -EINVAL;
2459        }
2460
2461        /* Before computing a checksum, we should make sure no frag could
2462         * be modified by an external entity: the checksum could be wrong.
2463         */
2464        if (skb_has_shared_frag(skb)) {
2465                ret = __skb_linearize(skb);
2466                if (ret)
2467                        goto out;
2468        }
2469
2470        offset = skb_checksum_start_offset(skb);
2471        BUG_ON(offset >= skb_headlen(skb));
2472        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2473
2474        offset += skb->csum_offset;
2475        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2476
2477        if (skb_cloned(skb) &&
2478            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2479                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2480                if (ret)
2481                        goto out;
2482        }
2483
2484        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2485out_set_summed:
2486        skb->ip_summed = CHECKSUM_NONE;
2487out:
2488        return ret;
2489}
2490EXPORT_SYMBOL(skb_checksum_help);
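/*
 * Example sketch (not part of dev.c): the common driver fallback when the
 * hardware cannot produce a checksum the stack asked for.  The helper name
 * and error code are hypothetical.
 */
static int example_tx_fallback_csum(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
                return -EIO;    /* caller should drop the frame */
        return 0;
}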
2491
2492/* __skb_csum_offload_chk - Driver helper function to determine if a device
2493 * with limited checksum offload capabilities is able to offload the checksum
2494 * for a given packet.
2495 *
2496 * Arguments:
2497 *   skb - sk_buff for the packet in question
2498 *   spec - contains the description of what device can offload
2499 *   csum_encapped - returns true if the checksum being offloaded is
2500 *            encapsulated. That is, it is the checksum for the transport
2501 *            header in the inner headers.
2502 *   checksum_help - when set, indicates that the helper should call
2503 *            skb_checksum_help if the offload checks fail
2504 *
2505 * Returns:
2506 *   true: Packet has passed the checksum checks and should be offloadable to
2507 *         the device (a driver may still need to check for additional
2508 *         restrictions of its device)
2509 *   false: Checksum is not offloadable. If checksum_help was set then
2510 *         skb_checksum_help was called to resolve the checksum for non-GSO
2511 *         packets whose IP protocol is not SCTP
2512 */
2513bool __skb_csum_offload_chk(struct sk_buff *skb,
2514                            const struct skb_csum_offl_spec *spec,
2515                            bool *csum_encapped,
2516                            bool csum_help)
2517{
2518        struct iphdr *iph;
2519        struct ipv6hdr *ipv6;
2520        void *nhdr;
2521        int protocol;
2522        u8 ip_proto;
2523
2524        if (skb->protocol == htons(ETH_P_8021Q) ||
2525            skb->protocol == htons(ETH_P_8021AD)) {
2526                if (!spec->vlan_okay)
2527                        goto need_help;
2528        }
2529
2530        /* We check whether the checksum refers to a transport layer checksum in
2531         * the outermost header or an encapsulated transport layer checksum that
2532         * corresponds to the inner headers of the skb. If the checksum is for
2533         * something else in the packet we need help.
2534         */
2535        if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2536                /* Non-encapsulated checksum */
2537                protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2538                nhdr = skb_network_header(skb);
2539                *csum_encapped = false;
2540                if (spec->no_not_encapped)
2541                        goto need_help;
2542        } else if (skb->encapsulation && spec->encap_okay &&
2543                   skb_checksum_start_offset(skb) ==
2544                   skb_inner_transport_offset(skb)) {
2545                /* Encapsulated checksum */
2546                *csum_encapped = true;
2547                switch (skb->inner_protocol_type) {
2548                case ENCAP_TYPE_ETHER:
2549                        protocol = eproto_to_ipproto(skb->inner_protocol);
2550                        break;
2551                case ENCAP_TYPE_IPPROTO:
2552                        protocol = skb->inner_protocol;
2553                        break;
2554                }
2555                nhdr = skb_inner_network_header(skb);
2556        } else {
2557                goto need_help;
2558        }
2559
2560        switch (protocol) {
2561        case IPPROTO_IP:
2562                if (!spec->ipv4_okay)
2563                        goto need_help;
2564                iph = nhdr;
2565                ip_proto = iph->protocol;
2566                if (iph->ihl != 5 && !spec->ip_options_okay)
2567                        goto need_help;
2568                break;
2569        case IPPROTO_IPV6:
2570                if (!spec->ipv6_okay)
2571                        goto need_help;
2572                if (spec->no_encapped_ipv6 && *csum_encapped)
2573                        goto need_help;
2574                ipv6 = nhdr;
2575                nhdr += sizeof(*ipv6);
2576                ip_proto = ipv6->nexthdr;
2577                break;
2578        default:
2579                goto need_help;
2580        }
2581
2582ip_proto_again:
2583        switch (ip_proto) {
2584        case IPPROTO_TCP:
2585                if (!spec->tcp_okay ||
2586                    skb->csum_offset != offsetof(struct tcphdr, check))
2587                        goto need_help;
2588                break;
2589        case IPPROTO_UDP:
2590                if (!spec->udp_okay ||
2591                    skb->csum_offset != offsetof(struct udphdr, check))
2592                        goto need_help;
2593                break;
2594        case IPPROTO_SCTP:
2595                if (!spec->sctp_okay ||
2596                    skb->csum_offset != offsetof(struct sctphdr, checksum))
2597                        goto cant_help;
2598                break;
2599        case NEXTHDR_HOP:
2600        case NEXTHDR_ROUTING:
2601        case NEXTHDR_DEST: {
2602                u8 *opthdr = nhdr;
2603
2604                if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2605                        goto need_help;
2606
2607                ip_proto = opthdr[0];
2608                nhdr += (opthdr[1] + 1) << 3;
2609
2610                goto ip_proto_again;
2611        }
2612        default:
2613                goto need_help;
2614        }
2615
2616        /* Passed the tests for offloading checksum */
2617        return true;
2618
2619need_help:
2620        if (csum_help && !skb_shinfo(skb)->gso_size)
2621                skb_checksum_help(skb);
2622cant_help:
2623        return false;
2624}
2625EXPORT_SYMBOL(__skb_csum_offload_chk);
2626
2627__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628{
2629        __be16 type = skb->protocol;
2630
2631        /* Tunnel gso handlers can set protocol to ethernet. */
2632        if (type == htons(ETH_P_TEB)) {
2633                struct ethhdr *eth;
2634
2635                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636                        return 0;
2637
2638                eth = (struct ethhdr *)skb_mac_header(skb);
2639                type = eth->h_proto;
2640        }
2641
2642        return __vlan_get_protocol(skb, type, depth);
2643}
2644
2645/**
2646 *      skb_mac_gso_segment - mac layer segmentation handler.
2647 *      @skb: buffer to segment
2648 *      @features: features for the output path (see dev->features)
2649 */
2650struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651                                    netdev_features_t features)
2652{
2653        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654        struct packet_offload *ptype;
2655        int vlan_depth = skb->mac_len;
2656        __be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658        if (unlikely(!type))
2659                return ERR_PTR(-EINVAL);
2660
2661        __skb_pull(skb, vlan_depth);
2662
2663        rcu_read_lock();
2664        list_for_each_entry_rcu(ptype, &offload_base, list) {
2665                if (ptype->type == type && ptype->callbacks.gso_segment) {
2666                        segs = ptype->callbacks.gso_segment(skb, features);
2667                        break;
2668                }
2669        }
2670        rcu_read_unlock();
2671
2672        __skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674        return segs;
2675}
2676EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679/* openvswitch calls this on rx path, so we need a different check.
2680 */
2681static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682{
2683        if (tx_path)
2684                return skb->ip_summed != CHECKSUM_PARTIAL;
2685        else
2686                return skb->ip_summed == CHECKSUM_NONE;
2687}
2688
2689/**
2690 *      __skb_gso_segment - Perform segmentation on skb.
2691 *      @skb: buffer to segment
2692 *      @features: features for the output path (see dev->features)
2693 *      @tx_path: whether it is called in TX path
2694 *
2695 *      This function segments the given skb and returns a list of segments.
2696 *
2697 *      It may return NULL if the skb requires no segmentation.  This is
2698 *      only possible when GSO is used for verifying header integrity.
2699 *
2700 *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703                                  netdev_features_t features, bool tx_path)
2704{
2705        if (unlikely(skb_needs_check(skb, tx_path))) {
2706                int err;
2707
2708                skb_warn_bad_offload(skb);
2709
2710                err = skb_cow_head(skb, 0);
2711                if (err < 0)
2712                        return ERR_PTR(err);
2713        }
2714
2715        /* Only report GSO partial support if it will enable us to
2716         * support segmentation on this frame without needing additional
2717         * work.
2718         */
2719        if (features & NETIF_F_GSO_PARTIAL) {
2720                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2721                struct net_device *dev = skb->dev;
2722
2723                partial_features |= dev->features & dev->gso_partial_features;
2724                if (!skb_gso_ok(skb, features | partial_features))
2725                        features &= ~NETIF_F_GSO_PARTIAL;
2726        }
2727
2728        BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2729                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2730
2731        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2732        SKB_GSO_CB(skb)->encap_level = 0;
2733
2734        skb_reset_mac_header(skb);
2735        skb_reset_mac_len(skb);
2736
2737        return skb_mac_gso_segment(skb, features);
2738}
2739EXPORT_SYMBOL(__skb_gso_segment);
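/*
 * Example sketch (not part of dev.c): software-segmenting a GSO skb through
 * the skb_gso_segment() wrapper and handing back the segment list, roughly
 * what validate_xmit_skb() does further below.
 */
static struct sk_buff *example_soft_segment(struct sk_buff *skb,
                                            netdev_features_t features)
{
        struct sk_buff *segs;

        segs = skb_gso_segment(skb, features);
        if (IS_ERR(segs))
                return NULL;            /* caller should free and drop skb */
        if (segs) {
                consume_skb(skb);       /* original replaced by the segments */
                skb = segs;
        }
        return skb;
}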
2740
2741/* Take action when hardware reception checksum errors are detected. */
2742#ifdef CONFIG_BUG
2743void netdev_rx_csum_fault(struct net_device *dev)
2744{
2745        if (net_ratelimit()) {
2746                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2747                dump_stack();
2748        }
2749}
2750EXPORT_SYMBOL(netdev_rx_csum_fault);
2751#endif
2752
2753/* Actually, we should eliminate this check as soon as we know that:
2754 * 1. An IOMMU is present and can map all of the machine's memory.
2755 * 2. No high memory really exists on this machine.
2756 */
2757
2758static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2759{
2760#ifdef CONFIG_HIGHMEM
2761        int i;
2762        if (!(dev->features & NETIF_F_HIGHDMA)) {
2763                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2764                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2765                        if (PageHighMem(skb_frag_page(frag)))
2766                                return 1;
2767                }
2768        }
2769
2770        if (PCI_DMA_BUS_IS_PHYS) {
2771                struct device *pdev = dev->dev.parent;
2772
2773                if (!pdev)
2774                        return 0;
2775                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2776                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2777                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2778                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2779                                return 1;
2780                }
2781        }
2782#endif
2783        return 0;
2784}
2785
2786/* For an MPLS offload request, verify we are testing hardware MPLS features
2787 * instead of the netdev's standard features.
2788 */
2789#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2790static netdev_features_t net_mpls_features(struct sk_buff *skb,
2791                                           netdev_features_t features,
2792                                           __be16 type)
2793{
2794        if (eth_p_mpls(type))
2795                features &= skb->dev->mpls_features;
2796
2797        return features;
2798}
2799#else
2800static netdev_features_t net_mpls_features(struct sk_buff *skb,
2801                                           netdev_features_t features,
2802                                           __be16 type)
2803{
2804        return features;
2805}
2806#endif
2807
2808static netdev_features_t harmonize_features(struct sk_buff *skb,
2809        netdev_features_t features)
2810{
2811        int tmp;
2812        __be16 type;
2813
2814        type = skb_network_protocol(skb, &tmp);
2815        features = net_mpls_features(skb, features, type);
2816
2817        if (skb->ip_summed != CHECKSUM_NONE &&
2818            !can_checksum_protocol(features, type)) {
2819                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2820        } else if (illegal_highdma(skb->dev, skb)) {
2821                features &= ~NETIF_F_SG;
2822        }
2823
2824        return features;
2825}
2826
2827netdev_features_t passthru_features_check(struct sk_buff *skb,
2828                                          struct net_device *dev,
2829                                          netdev_features_t features)
2830{
2831        return features;
2832}
2833EXPORT_SYMBOL(passthru_features_check);
2834
2835static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2836                                             struct net_device *dev,
2837                                             netdev_features_t features)
2838{
2839        return vlan_features_check(skb, features);
2840}
2841
2842static netdev_features_t gso_features_check(const struct sk_buff *skb,
2843                                            struct net_device *dev,
2844                                            netdev_features_t features)
2845{
2846        u16 gso_segs = skb_shinfo(skb)->gso_segs;
2847
2848        if (gso_segs > dev->gso_max_segs)
2849                return features & ~NETIF_F_GSO_MASK;
2850
2851        /* Support for GSO partial features requires software
2852         * intervention before we can actually process the packets,
2853         * so we need to strip support for any partial features now;
2854         * they can be pulled back in after we have partially
2855         * segmented the frame.
2856         */
2857        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2858                features &= ~dev->gso_partial_features;
2859
2860        /* Make sure to clear the IPv4 ID mangling feature if the
2861         * IPv4 header has the potential to be fragmented.
2862         */
2863        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2864                struct iphdr *iph = skb->encapsulation ?
2865                                    inner_ip_hdr(skb) : ip_hdr(skb);
2866
2867                if (!(iph->frag_off & htons(IP_DF)))
2868                        features &= ~NETIF_F_TSO_MANGLEID;
2869        }
2870
2871        return features;
2872}
2873
2874netdev_features_t netif_skb_features(struct sk_buff *skb)
2875{
2876        struct net_device *dev = skb->dev;
2877        netdev_features_t features = dev->features;
2878
2879        if (skb_is_gso(skb))
2880                features = gso_features_check(skb, dev, features);
2881
2882        /* For an encapsulation offload request, verify we are testing
2883         * hardware encapsulation features instead of the netdev's
2884         * standard features
2885         */
2886        if (skb->encapsulation)
2887                features &= dev->hw_enc_features;
2888
2889        if (skb_vlan_tagged(skb))
2890                features = netdev_intersect_features(features,
2891                                                     dev->vlan_features |
2892                                                     NETIF_F_HW_VLAN_CTAG_TX |
2893                                                     NETIF_F_HW_VLAN_STAG_TX);
2894
2895        if (dev->netdev_ops->ndo_features_check)
2896                features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897                                                                features);
2898        else
2899                features &= dflt_features_check(skb, dev, features);
2900
2901        return harmonize_features(skb, features);
2902}
2903EXPORT_SYMBOL(netif_skb_features);
2904
2905static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2906                    struct netdev_queue *txq, bool more)
2907{
2908        unsigned int len;
2909        int rc;
2910
2911        if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2912                dev_queue_xmit_nit(skb, dev);
2913
2914        len = skb->len;
2915        trace_net_dev_start_xmit(skb, dev);
2916        rc = netdev_start_xmit(skb, dev, txq, more);
2917        trace_net_dev_xmit(skb, rc, dev, len);
2918
2919        return rc;
2920}
2921
2922struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2923                                    struct netdev_queue *txq, int *ret)
2924{
2925        struct sk_buff *skb = first;
2926        int rc = NETDEV_TX_OK;
2927
2928        while (skb) {
2929                struct sk_buff *next = skb->next;
2930
2931                skb->next = NULL;
2932                rc = xmit_one(skb, dev, txq, next != NULL);
2933                if (unlikely(!dev_xmit_complete(rc))) {
2934                        skb->next = next;
2935                        goto out;
2936                }
2937
2938                skb = next;
2939                if (netif_xmit_stopped(txq) && skb) {
2940                        rc = NETDEV_TX_BUSY;
2941                        break;
2942                }
2943        }
2944
2945out:
2946        *ret = rc;
2947        return skb;
2948}
2949
2950static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2951                                          netdev_features_t features)
2952{
2953        if (skb_vlan_tag_present(skb) &&
2954            !vlan_hw_offload_capable(features, skb->vlan_proto))
2955                skb = __vlan_hwaccel_push_inside(skb);
2956        return skb;
2957}
2958
2959static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2960{
2961        netdev_features_t features;
2962
2963        features = netif_skb_features(skb);
2964        skb = validate_xmit_vlan(skb, features);
2965        if (unlikely(!skb))
2966                goto out_null;
2967
2968        if (netif_needs_gso(skb, features)) {
2969                struct sk_buff *segs;
2970
2971                segs = skb_gso_segment(skb, features);
2972                if (IS_ERR(segs)) {
2973                        goto out_kfree_skb;
2974                } else if (segs) {
2975                        consume_skb(skb);
2976                        skb = segs;
2977                }
2978        } else {
2979                if (skb_needs_linearize(skb, features) &&
2980                    __skb_linearize(skb))
2981                        goto out_kfree_skb;
2982
2983                /* If packet is not checksummed and device does not
2984                 * support checksumming for this protocol, complete
2985                 * checksumming here.
2986                 */
2987                if (skb->ip_summed == CHECKSUM_PARTIAL) {
2988                        if (skb->encapsulation)
2989                                skb_set_inner_transport_header(skb,
2990                                                               skb_checksum_start_offset(skb));
2991                        else
2992                                skb_set_transport_header(skb,
2993                                                         skb_checksum_start_offset(skb));
2994                        if (!(features & NETIF_F_CSUM_MASK) &&
2995                            skb_checksum_help(skb))
2996                                goto out_kfree_skb;
2997                }
2998        }
2999
3000        return skb;
3001
3002out_kfree_skb:
3003        kfree_skb(skb);
3004out_null:
3005        atomic_long_inc(&dev->tx_dropped);
3006        return NULL;
3007}
3008
3009struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3010{
3011        struct sk_buff *next, *head = NULL, *tail;
3012
3013        for (; skb != NULL; skb = next) {
3014                next = skb->next;
3015                skb->next = NULL;
3016
3017                /* in case skb won't be segmented, point to itself */
3018                skb->prev = skb;
3019
3020                skb = validate_xmit_skb(skb, dev);
3021                if (!skb)
3022                        continue;
3023
3024                if (!head)
3025                        head = skb;
3026                else
3027                        tail->next = skb;
3028                /* If skb was segmented, skb->prev points to
3029                 * the last segment. If not, it still contains skb.
3030                 */
3031                tail = skb->prev;
3032        }
3033        return head;
3034}
3035
3036static void qdisc_pkt_len_init(struct sk_buff *skb)
3037{
3038        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3039
3040        qdisc_skb_cb(skb)->pkt_len = skb->len;
3041
3042        /* To get a more precise estimate of the bytes sent on the wire,
3043         * we add the header size of all segments to pkt_len
3044         */
3045        if (shinfo->gso_size)  {
3046                unsigned int hdr_len;
3047                u16 gso_segs = shinfo->gso_segs;
3048
3049                /* mac layer + network layer */
3050                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3051
3052                /* + transport layer */
3053                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3054                        hdr_len += tcp_hdrlen(skb);
3055                else
3056                        hdr_len += sizeof(struct udphdr);
3057
3058                if (shinfo->gso_type & SKB_GSO_DODGY)
3059                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3060                                                shinfo->gso_size);
3061
3062                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3063        }
3064}
3065
3066static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3067                                 struct net_device *dev,
3068                                 struct netdev_queue *txq)
3069{
3070        spinlock_t *root_lock = qdisc_lock(q);
3071        bool contended;
3072        int rc;
3073
3074        qdisc_calculate_pkt_len(skb, q);
3075        /*
3076         * Heuristic to force contended enqueues to serialize on a
3077         * separate lock before trying to get qdisc main lock.
3078         * This permits __QDISC___STATE_RUNNING owner to get the lock more
3079         * often and dequeue packets faster.
3080         */
3081        contended = qdisc_is_running(q);
3082        if (unlikely(contended))
3083                spin_lock(&q->busylock);
3084
3085        spin_lock(root_lock);
3086        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3087                kfree_skb(skb);
3088                rc = NET_XMIT_DROP;
3089        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3090                   qdisc_run_begin(q)) {
3091                /*
3092                 * This is a work-conserving queue; there are no old skbs
3093                 * waiting to be sent out; and the qdisc is not running -
3094                 * xmit the skb directly.
3095                 */
3096
3097                qdisc_bstats_update(q, skb);
3098
3099                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3100                        if (unlikely(contended)) {
3101                                spin_unlock(&q->busylock);
3102                                contended = false;
3103                        }
3104                        __qdisc_run(q);
3105                } else
3106                        qdisc_run_end(q);
3107
3108                rc = NET_XMIT_SUCCESS;
3109        } else {
3110                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
3111                if (qdisc_run_begin(q)) {
3112                        if (unlikely(contended)) {
3113                                spin_unlock(&q->busylock);
3114                                contended = false;
3115                        }
3116                        __qdisc_run(q);
3117                }
3118        }
3119        spin_unlock(root_lock);
3120        if (unlikely(contended))
3121                spin_unlock(&q->busylock);
3122        return rc;
3123}
3124
3125#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3126static void skb_update_prio(struct sk_buff *skb)
3127{
3128        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3129
3130        if (!skb->priority && skb->sk && map) {
3131                unsigned int prioidx =
3132                        sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3133
3134                if (prioidx < map->priomap_len)
3135                        skb->priority = map->priomap[prioidx];
3136        }
3137}
3138#else
3139#define skb_update_prio(skb)
3140#endif
3141
3142DEFINE_PER_CPU(int, xmit_recursion);
3143EXPORT_SYMBOL(xmit_recursion);
3144
3145#define RECURSION_LIMIT 10
3146
3147/**
3148 *      dev_loopback_xmit - loop back @skb
3149 *      @net: network namespace this loopback is happening in
3150 *      @sk:  sk needed to be a netfilter okfn
3151 *      @skb: buffer to transmit
3152 */
3153int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3154{
3155        skb_reset_mac_header(skb);
3156        __skb_pull(skb, skb_network_offset(skb));
3157        skb->pkt_type = PACKET_LOOPBACK;
3158        skb->ip_summed = CHECKSUM_UNNECESSARY;
3159        WARN_ON(!skb_dst(skb));
3160        skb_dst_force(skb);
3161        netif_rx_ni(skb);
3162        return 0;
3163}
3164EXPORT_SYMBOL(dev_loopback_xmit);
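/* Usage sketch (illustrative, not part of this file): dev_loopback_xmit() is
 * typically used as the okfn of an output-path netfilter hook when a copy of
 * a packet should be looped back to the local stack, e.g. roughly:
 *
 *	NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 *		net, sk, newskb, NULL, newskb->dev,
 *		dev_loopback_xmit);
 *
 * where newskb is a clone of the outgoing multicast packet.  The skb must
 * carry a valid dst before the call (hence the WARN_ON above).
 */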
3165
3166#ifdef CONFIG_NET_EGRESS
3167static struct sk_buff *
3168sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3169{
3170        struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3171        struct tcf_result cl_res;
3172
3173        if (!cl)
3174                return skb;
3175
3176        /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3177         * earlier by the caller.
3178         */
3179        qdisc_bstats_cpu_update(cl->q, skb);
3180
3181        switch (tc_classify(skb, cl, &cl_res, false)) {
3182        case TC_ACT_OK:
3183        case TC_ACT_RECLASSIFY:
3184                skb->tc_index = TC_H_MIN(cl_res.classid);
3185                break;
3186        case TC_ACT_SHOT:
3187                qdisc_qstats_cpu_drop(cl->q);
3188                *ret = NET_XMIT_DROP;
3189                kfree_skb(skb);
3190                return NULL;
3191        case TC_ACT_STOLEN:
3192        case TC_ACT_QUEUED:
3193                *ret = NET_XMIT_SUCCESS;
3194                consume_skb(skb);
3195                return NULL;
3196        case TC_ACT_REDIRECT:
3197                /* No need to push/pop skb's mac_header here on egress! */
3198                skb_do_redirect(skb);
3199                *ret = NET_XMIT_SUCCESS;
3200                return NULL;
3201        default:
3202                break;
3203        }
3204
3205        return skb;
3206}
3207#endif /* CONFIG_NET_EGRESS */
3208
3209static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3210{
3211#ifdef CONFIG_XPS
3212        struct xps_dev_maps *dev_maps;
3213        struct xps_map *map;
3214        int queue_index = -1;
3215
3216        rcu_read_lock();
3217        dev_maps = rcu_dereference(dev->xps_maps);
3218        if (dev_maps) {
3219                map = rcu_dereference(
3220                    dev_maps->cpu_map[skb->sender_cpu - 1]);
3221                if (map) {
3222                        if (map->len == 1)
3223                                queue_index = map->queues[0];
3224                        else
3225                                queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3226                                                                           map->len)];
3227                        if (unlikely(queue_index >= dev->real_num_tx_queues))
3228                                queue_index = -1;
3229                }
3230        }
3231        rcu_read_unlock();
3232
3233        return queue_index;
3234#else
3235        return -1;
3236#endif
3237}
3238
3239static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3240{
3241        struct sock *sk = skb->sk;
3242        int queue_index = sk_tx_queue_get(sk);
3243
3244        if (queue_index < 0 || skb->ooo_okay ||
3245            queue_index >= dev->real_num_tx_queues) {
3246                int new_index = get_xps_queue(dev, skb);
3247                if (new_index < 0)
3248                        new_index = skb_tx_hash(dev, skb);
3249
3250                if (queue_index != new_index && sk &&
3251                    sk_fullsock(sk) &&
3252                    rcu_access_pointer(sk->sk_dst_cache))
3253                        sk_tx_queue_set(sk, new_index);
3254
3255                queue_index = new_index;
3256        }
3257
3258        return queue_index;
3259}
3260
3261struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3262                                    struct sk_buff *skb,
3263                                    void *accel_priv)
3264{
3265        int queue_index = 0;
3266
3267#ifdef CONFIG_XPS
3268        u32 sender_cpu = skb->sender_cpu - 1;
3269
3270        if (sender_cpu >= (u32)NR_CPUS)
3271                skb->sender_cpu = raw_smp_processor_id() + 1;
3272#endif
3273
3274        if (dev->real_num_tx_queues != 1) {
3275                const struct net_device_ops *ops = dev->netdev_ops;
3276                if (ops->ndo_select_queue)
3277                        queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3278                                                            __netdev_pick_tx);
3279                else
3280                        queue_index = __netdev_pick_tx(dev, skb);
3281
3282                if (!accel_priv)
3283                        queue_index = netdev_cap_txqueue(dev, queue_index);
3284        }
3285
3286        skb_set_queue_mapping(skb, queue_index);
3287        return netdev_get_tx_queue(dev, queue_index);
3288}
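/* Sketch of a driver-supplied ndo_select_queue() (hypothetical code, shown
 * only to illustrate how the callback interacts with __netdev_pick_tx()):
 *
 *	static u16 my_select_queue(struct net_device *dev, struct sk_buff *skb,
 *				   void *accel_priv,
 *				   select_queue_fallback_t fallback)
 *	{
 *		if (skb->priority == TC_PRIO_CONTROL)
 *			return dev->real_num_tx_queues - 1;
 *		return fallback(dev, skb);
 *	}
 *
 * Drivers without such a callback get __netdev_pick_tx() directly.
 */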
3289
3290/**
3291 *      __dev_queue_xmit - transmit a buffer
3292 *      @skb: buffer to transmit
3293 *      @accel_priv: private data used for L2 forwarding offload
3294 *
3295 *      Queue a buffer for transmission to a network device. The caller must
3296 *      have set the device and priority and built the buffer before calling
3297 *      this function. The function can be called from an interrupt.
3298 *
3299 *      A negative errno code is returned on a failure. A success does not
3300 *      guarantee the frame will be transmitted as it may be dropped due
3301 *      to congestion or traffic shaping.
3302 *
3303 * -----------------------------------------------------------------------------------
3304 *      I notice this method can also return errors from the queue disciplines,
3305 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3306 *      be positive.
3307 *
3308 *      Regardless of the return value, the skb is consumed, so it is currently
3309 *      difficult to retry a send to this method.  (You can bump the ref count
3310 *      before sending to hold a reference for retry if you are careful.)
3311 *
3312 *      When calling this method, interrupts MUST be enabled.  This is because
3313 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3314 *          --BLG
3315 */
3316static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3317{
3318        struct net_device *dev = skb->dev;
3319        struct netdev_queue *txq;
3320        struct Qdisc *q;
3321        int rc = -ENOMEM;
3322
3323        skb_reset_mac_header(skb);
3324
3325        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3326                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3327
3328        /* Disable soft irqs for various locks below. Also
3329         * stops preemption for RCU.
3330         */
3331        rcu_read_lock_bh();
3332
3333        skb_update_prio(skb);
3334
3335        qdisc_pkt_len_init(skb);
3336#ifdef CONFIG_NET_CLS_ACT
3337        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3338# ifdef CONFIG_NET_EGRESS
3339        if (static_key_false(&egress_needed)) {
3340                skb = sch_handle_egress(skb, &rc, dev);
3341                if (!skb)
3342                        goto out;
3343        }
3344# endif
3345#endif
3346        /* If the device/qdisc doesn't need skb->dst, release it right now
3347         * while it's hot in this CPU's cache.
3348         */
3349        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3350                skb_dst_drop(skb);
3351        else
3352                skb_dst_force(skb);
3353
3354#ifdef CONFIG_NET_SWITCHDEV
3355        /* Don't forward if offload device already forwarded */
3356        if (skb->offload_fwd_mark &&
3357            skb->offload_fwd_mark == dev->offload_fwd_mark) {
3358                consume_skb(skb);
3359                rc = NET_XMIT_SUCCESS;
3360                goto out;
3361        }
3362#endif
3363
3364        txq = netdev_pick_tx(dev, skb, accel_priv);
3365        q = rcu_dereference_bh(txq->qdisc);
3366
3367        trace_net_dev_queue(skb);
3368        if (q->enqueue) {
3369                rc = __dev_xmit_skb(skb, q, dev, txq);
3370                goto out;
3371        }
3372
3373        /* The device has no queue. This is the common case for software
3374           devices: loopback, all sorts of tunnels...
3375
3376           Really, it is unlikely that netif_tx_lock protection is necessary
3377           here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
3378           counters.)
3379           However, it is possible that they rely on the protection
3380           we take here.
3381
3382           Check this and take the lock anyway; it is not prone to deadlocks.
3383           Or simply use the noqueue qdisc, which is even simpler 8)
3384         */
3385        if (dev->flags & IFF_UP) {
3386                int cpu = smp_processor_id(); /* ok because BHs are off */
3387
3388                if (txq->xmit_lock_owner != cpu) {
3389
3390                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3391                                goto recursion_alert;
3392
3393                        skb = validate_xmit_skb(skb, dev);
3394                        if (!skb)
3395                                goto out;
3396
3397                        HARD_TX_LOCK(dev, txq, cpu);
3398
3399                        if (!netif_xmit_stopped(txq)) {
3400                                __this_cpu_inc(xmit_recursion);
3401                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3402                                __this_cpu_dec(xmit_recursion);
3403                                if (dev_xmit_complete(rc)) {
3404                                        HARD_TX_UNLOCK(dev, txq);
3405                                        goto out;
3406                                }
3407                        }
3408                        HARD_TX_UNLOCK(dev, txq);
3409                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3410                                             dev->name);
3411                } else {
3412                        /* Recursion has been detected! It is possible,
3413                         * unfortunately.
3414                         */
3415recursion_alert:
3416                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3417                                             dev->name);
3418                }
3419        }
3420
3421        rc = -ENETDOWN;
3422        rcu_read_unlock_bh();
3423
3424        atomic_long_inc(&dev->tx_dropped);
3425        kfree_skb_list(skb);
3426        return rc;
3427out:
3428        rcu_read_unlock_bh();
3429        return rc;
3430}
3431
3432int dev_queue_xmit(struct sk_buff *skb)
3433{
3434        return __dev_queue_xmit(skb, NULL);
3435}
3436EXPORT_SYMBOL(dev_queue_xmit);
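/* Caller sketch (illustrative only): a module transmitting a frame it has
 * built itself would typically do something like
 *
 *	skb->dev = dev;
 *	skb_reset_network_header(skb);
 *	rc = dev_queue_xmit(skb);
 *
 * The skb is consumed whatever happens; rc may be a negative errno or a
 * positive NET_XMIT_* code, as described above __dev_queue_xmit().
 */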
3437
3438int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3439{
3440        return __dev_queue_xmit(skb, accel_priv);
3441}
3442EXPORT_SYMBOL(dev_queue_xmit_accel);
3443
3444
3445/*=======================================================================
3446                        Receiver routines
3447  =======================================================================*/
3448
3449int netdev_max_backlog __read_mostly = 1000;
3450EXPORT_SYMBOL(netdev_max_backlog);
3451
3452int netdev_tstamp_prequeue __read_mostly = 1;
3453int netdev_budget __read_mostly = 300;
3454int weight_p __read_mostly = 64;            /* old backlog weight */
3455
3456/* Called with irq disabled */
3457static inline void ____napi_schedule(struct softnet_data *sd,
3458                                     struct napi_struct *napi)
3459{
3460        list_add_tail(&napi->poll_list, &sd->poll_list);
3461        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3462}
3463
3464#ifdef CONFIG_RPS
3465
3466/* One global table that all flow-based protocols share. */
3467struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3468EXPORT_SYMBOL(rps_sock_flow_table);
3469u32 rps_cpu_mask __read_mostly;
3470EXPORT_SYMBOL(rps_cpu_mask);
3471
3472struct static_key rps_needed __read_mostly;
3473EXPORT_SYMBOL(rps_needed);
3474
3475static struct rps_dev_flow *
3476set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3477            struct rps_dev_flow *rflow, u16 next_cpu)
3478{
3479        if (next_cpu < nr_cpu_ids) {
3480#ifdef CONFIG_RFS_ACCEL
3481                struct netdev_rx_queue *rxqueue;
3482                struct rps_dev_flow_table *flow_table;
3483                struct rps_dev_flow *old_rflow;
3484                u32 flow_id;
3485                u16 rxq_index;
3486                int rc;
3487
3488                /* Should we steer this flow to a different hardware queue? */
3489                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3490                    !(dev->features & NETIF_F_NTUPLE))
3491                        goto out;
3492                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3493                if (rxq_index == skb_get_rx_queue(skb))
3494                        goto out;
3495
3496                rxqueue = dev->_rx + rxq_index;
3497                flow_table = rcu_dereference(rxqueue->rps_flow_table);
3498                if (!flow_table)
3499                        goto out;
3500                flow_id = skb_get_hash(skb) & flow_table->mask;
3501                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3502                                                        rxq_index, flow_id);
3503                if (rc < 0)
3504                        goto out;
3505                old_rflow = rflow;
3506                rflow = &flow_table->flows[flow_id];
3507                rflow->filter = rc;
3508                if (old_rflow->filter == rflow->filter)
3509                        old_rflow->filter = RPS_NO_FILTER;
3510        out:
3511#endif
3512                rflow->last_qtail =
3513                        per_cpu(softnet_data, next_cpu).input_queue_head;
3514        }
3515
3516        rflow->cpu = next_cpu;
3517        return rflow;
3518}
3519
3520/*
3521 * get_rps_cpu is called from netif_receive_skb and returns the target
3522 * CPU from the RPS map of the receiving queue for a given skb.
3523 * rcu_read_lock must be held on entry.
3524 */
3525static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3526                       struct rps_dev_flow **rflowp)
3527{
3528        const struct rps_sock_flow_table *sock_flow_table;
3529        struct netdev_rx_queue *rxqueue = dev->_rx;
3530        struct rps_dev_flow_table *flow_table;
3531        struct rps_map *map;
3532        int cpu = -1;
3533        u32 tcpu;
3534        u32 hash;
3535
3536        if (skb_rx_queue_recorded(skb)) {
3537                u16 index = skb_get_rx_queue(skb);
3538
3539                if (unlikely(index >= dev->real_num_rx_queues)) {
3540                        WARN_ONCE(dev->real_num_rx_queues > 1,
3541                                  "%s received packet on queue %u, but number "
3542                                  "of RX queues is %u\n",
3543                                  dev->name, index, dev->real_num_rx_queues);
3544                        goto done;
3545                }
3546                rxqueue += index;
3547        }
3548
3549        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3550
3551        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3552        map = rcu_dereference(rxqueue->rps_map);
3553        if (!flow_table && !map)
3554                goto done;
3555
3556        skb_reset_network_header(skb);
3557        hash = skb_get_hash(skb);
3558        if (!hash)
3559                goto done;
3560
3561        sock_flow_table = rcu_dereference(rps_sock_flow_table);
3562        if (flow_table && sock_flow_table) {
3563                struct rps_dev_flow *rflow;
3564                u32 next_cpu;
3565                u32 ident;
3566
3567                /* First check the global flow table for a match */
3568                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3569                if ((ident ^ hash) & ~rps_cpu_mask)
3570                        goto try_rps;
3571
3572                next_cpu = ident & rps_cpu_mask;
3573
3574                /* OK, now we know there is a match,
3575                 * we can look at the local (per receive queue) flow table
3576                 */
3577                rflow = &flow_table->flows[hash & flow_table->mask];
3578                tcpu = rflow->cpu;
3579
3580                /*
3581                 * If the desired CPU (where last recvmsg was done) is
3582                 * different from current CPU (one in the rx-queue flow
3583                 * table entry), switch if one of the following holds:
3584                 *   - Current CPU is unset (>= nr_cpu_ids).
3585                 *   - Current CPU is offline.
3586                 *   - The current CPU's queue tail has advanced beyond the
3587                 *     last packet that was enqueued using this table entry.
3588                 *     This guarantees that all previous packets for the flow
3589                 *     have been dequeued, thus preserving in order delivery.
3590                 */
3591                if (unlikely(tcpu != next_cpu) &&
3592                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3593                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3594                      rflow->last_qtail)) >= 0)) {
3595                        tcpu = next_cpu;
3596                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3597                }
3598
3599                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3600                        *rflowp = rflow;
3601                        cpu = tcpu;
3602                        goto done;
3603                }
3604        }
3605
3606try_rps:
3607
3608        if (map) {
3609                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3610                if (cpu_online(tcpu)) {
3611                        cpu = tcpu;
3612                        goto done;
3613                }
3614        }
3615
3616done:
3617        return cpu;
3618}
3619
3620#ifdef CONFIG_RFS_ACCEL
3621
3622/**
3623 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3624 * @dev: Device on which the filter was set
3625 * @rxq_index: RX queue index
3626 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3627 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3628 *
3629 * Drivers that implement ndo_rx_flow_steer() should periodically call
3630 * this function for each installed filter and remove the filters for
3631 * which it returns %true.
3632 */
3633bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3634                         u32 flow_id, u16 filter_id)
3635{
3636        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3637        struct rps_dev_flow_table *flow_table;
3638        struct rps_dev_flow *rflow;
3639        bool expire = true;
3640        unsigned int cpu;
3641
3642        rcu_read_lock();
3643        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3644        if (flow_table && flow_id <= flow_table->mask) {
3645                rflow = &flow_table->flows[flow_id];
3646                cpu = ACCESS_ONCE(rflow->cpu);
3647                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3648                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3649                           rflow->last_qtail) <
3650                     (int)(10 * flow_table->mask)))
3651                        expire = false;
3652        }
3653        rcu_read_unlock();
3654        return expire;
3655}
3656EXPORT_SYMBOL(rps_may_expire_flow);
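/* Usage sketch (illustrative): a driver implementing ndo_rx_flow_steer()
 * typically scans its installed filters from a periodic work item and does
 * roughly
 *
 *	if (rps_may_expire_flow(netdev, rxq_index, flow_id, filter_id))
 *		remove_hw_filter(...);	// driver-specific teardown, hypothetical
 *
 * so that stale accelerated-RFS filters are eventually reclaimed.
 */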
3657
3658#endif /* CONFIG_RFS_ACCEL */
3659
3660/* Called from hardirq (IPI) context */
3661static void rps_trigger_softirq(void *data)
3662{
3663        struct softnet_data *sd = data;
3664
3665        ____napi_schedule(sd, &sd->backlog);
3666        sd->received_rps++;
3667}
3668
3669#endif /* CONFIG_RPS */
3670
3671/*
3672 * Check if this softnet_data structure belongs to another CPU.
3673 * If so, queue it on our IPI list and return 1;
3674 * otherwise return 0.
3675 */
3676static int rps_ipi_queued(struct softnet_data *sd)
3677{
3678#ifdef CONFIG_RPS
3679        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3680
3681        if (sd != mysd) {
3682                sd->rps_ipi_next = mysd->rps_ipi_list;
3683                mysd->rps_ipi_list = sd;
3684
3685                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3686                return 1;
3687        }
3688#endif /* CONFIG_RPS */
3689        return 0;
3690}
3691
3692#ifdef CONFIG_NET_FLOW_LIMIT
3693int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3694#endif
3695
3696static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3697{
3698#ifdef CONFIG_NET_FLOW_LIMIT
3699        struct sd_flow_limit *fl;
3700        struct softnet_data *sd;
3701        unsigned int old_flow, new_flow;
3702
3703        if (qlen < (netdev_max_backlog >> 1))
3704                return false;
3705
3706        sd = this_cpu_ptr(&softnet_data);
3707
3708        rcu_read_lock();
3709        fl = rcu_dereference(sd->flow_limit);
3710        if (fl) {
3711                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3712                old_flow = fl->history[fl->history_head];
3713                fl->history[fl->history_head] = new_flow;
3714
3715                fl->history_head++;
3716                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3717
3718                if (likely(fl->buckets[old_flow]))
3719                        fl->buckets[old_flow]--;
3720
3721                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3722                        fl->count++;
3723                        rcu_read_unlock();
3724                        return true;
3725                }
3726        }
3727        rcu_read_unlock();
3728#endif
3729        return false;
3730}
3731
3732/*
3733 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
3734 * queue (which may be a remote CPU's queue).
3735 */
3736static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3737                              unsigned int *qtail)
3738{
3739        struct softnet_data *sd;
3740        unsigned long flags;
3741        unsigned int qlen;
3742
3743        sd = &per_cpu(softnet_data, cpu);
3744
3745        local_irq_save(flags);
3746
3747        rps_lock(sd);
3748        if (!netif_running(skb->dev))
3749                goto drop;
3750        qlen = skb_queue_len(&sd->input_pkt_queue);
3751        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3752                if (qlen) {
3753enqueue:
3754                        __skb_queue_tail(&sd->input_pkt_queue, skb);
3755                        input_queue_tail_incr_save(sd, qtail);
3756                        rps_unlock(sd);
3757                        local_irq_restore(flags);
3758                        return NET_RX_SUCCESS;
3759                }
3760
3761                /* Schedule NAPI for the backlog device.
3762                 * We can use a non-atomic operation since we own the queue lock.
3763                 */
3764                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3765                        if (!rps_ipi_queued(sd))
3766                                ____napi_schedule(sd, &sd->backlog);
3767                }
3768                goto enqueue;
3769        }
3770
3771drop:
3772        sd->dropped++;
3773        rps_unlock(sd);
3774
3775        local_irq_restore(flags);
3776
3777        atomic_long_inc(&skb->dev->rx_dropped);
3778        kfree_skb(skb);
3779        return NET_RX_DROP;
3780}
3781
3782static int netif_rx_internal(struct sk_buff *skb)
3783{
3784        int ret;
3785
3786        net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788        trace_netif_rx(skb);
3789#ifdef CONFIG_RPS
3790        if (static_key_false(&rps_needed)) {
3791                struct rps_dev_flow voidflow, *rflow = &voidflow;
3792                int cpu;
3793
3794                preempt_disable();
3795                rcu_read_lock();
3796
3797                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3798                if (cpu < 0)
3799                        cpu = smp_processor_id();
3800
3801                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802
3803                rcu_read_unlock();
3804                preempt_enable();
3805        } else
3806#endif
3807        {
3808                unsigned int qtail;
3809                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3810                put_cpu();
3811        }
3812        return ret;
3813}
3814
3815/**
3816 *      netif_rx        -       post buffer to the network code
3817 *      @skb: buffer to post
3818 *
3819 *      This function receives a packet from a device driver and queues it for
3820 *      the upper (protocol) levels to process.  It always succeeds. The buffer
3821 *      may be dropped during processing for congestion control or by the
3822 *      protocol layers.
3823 *
3824 *      return values:
3825 *      NET_RX_SUCCESS  (no congestion)
3826 *      NET_RX_DROP     (packet was dropped)
3827 *
3828 */
3829
3830int netif_rx(struct sk_buff *skb)
3831{
3832        trace_netif_rx_entry(skb);
3833
3834        return netif_rx_internal(skb);
3835}
3836EXPORT_SYMBOL(netif_rx);
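/* Illustrative (non-NAPI) driver receive path, a sketch only:
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	if (!skb)
 *		return;			// count the drop
 *	...copy the received frame into skb...
 *	skb_put(skb, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * NAPI drivers should use netif_receive_skb()/napi_gro_receive() from their
 * poll routine instead.
 */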
3837
3838int netif_rx_ni(struct sk_buff *skb)
3839{
3840        int err;
3841
3842        trace_netif_rx_ni_entry(skb);
3843
3844        preempt_disable();
3845        err = netif_rx_internal(skb);
3846        if (local_softirq_pending())
3847                do_softirq();
3848        preempt_enable();
3849
3850        return err;
3851}
3852EXPORT_SYMBOL(netif_rx_ni);
3853
3854static void net_tx_action(struct softirq_action *h)
3855{
3856        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3857
3858        if (sd->completion_queue) {
3859                struct sk_buff *clist;
3860
3861                local_irq_disable();
3862                clist = sd->completion_queue;
3863                sd->completion_queue = NULL;
3864                local_irq_enable();
3865
3866                while (clist) {
3867                        struct sk_buff *skb = clist;
3868                        clist = clist->next;
3869
3870                        WARN_ON(atomic_read(&skb->users));
3871                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3872                                trace_consume_skb(skb);
3873                        else
3874                                trace_kfree_skb(skb, net_tx_action);
3875
3876                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3877                                __kfree_skb(skb);
3878                        else
3879                                __kfree_skb_defer(skb);
3880                }
3881
3882                __kfree_skb_flush();
3883        }
3884
3885        if (sd->output_queue) {
3886                struct Qdisc *head;
3887
3888                local_irq_disable();
3889                head = sd->output_queue;
3890                sd->output_queue = NULL;
3891                sd->output_queue_tailp = &sd->output_queue;
3892                local_irq_enable();
3893
3894                while (head) {
3895                        struct Qdisc *q = head;
3896                        spinlock_t *root_lock;
3897
3898                        head = head->next_sched;
3899
3900                        root_lock = qdisc_lock(q);
3901                        if (spin_trylock(root_lock)) {
3902                                smp_mb__before_atomic();
3903                                clear_bit(__QDISC_STATE_SCHED,
3904                                          &q->state);
3905                                qdisc_run(q);
3906                                spin_unlock(root_lock);
3907                        } else {
3908                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
3909                                              &q->state)) {
3910                                        __netif_reschedule(q);
3911                                } else {
3912                                        smp_mb__before_atomic();
3913                                        clear_bit(__QDISC_STATE_SCHED,
3914                                                  &q->state);
3915                                }
3916                        }
3917                }
3918        }
3919}
3920
3921#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3922    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3923/* This hook is defined here for ATM LANE */
3924int (*br_fdb_test_addr_hook)(struct net_device *dev,
3925                             unsigned char *addr) __read_mostly;
3926EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3927#endif
3928
3929static inline struct sk_buff *
3930sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3931                   struct net_device *orig_dev)
3932{
3933#ifdef CONFIG_NET_CLS_ACT
3934        struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3935        struct tcf_result cl_res;
3936
3937        /* If there is at least one ingress qdisc present somewhere (which
3938         * is how we got here, via the enabled static key), devices that
3939         * are not configured with an ingress qdisc will bail
3940         * out here.
3941         */
3942        if (!cl)
3943                return skb;
3944        if (*pt_prev) {
3945                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3946                *pt_prev = NULL;
3947        }
3948
3949        qdisc_skb_cb(skb)->pkt_len = skb->len;
3950        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3951        qdisc_bstats_cpu_update(cl->q, skb);
3952
3953        switch (tc_classify(skb, cl, &cl_res, false)) {
3954        case TC_ACT_OK:
3955        case TC_ACT_RECLASSIFY:
3956                skb->tc_index = TC_H_MIN(cl_res.classid);
3957                break;
3958        case TC_ACT_SHOT:
3959                qdisc_qstats_cpu_drop(cl->q);
3960                kfree_skb(skb);
3961                return NULL;
3962        case TC_ACT_STOLEN:
3963        case TC_ACT_QUEUED:
3964                consume_skb(skb);
3965                return NULL;
3966        case TC_ACT_REDIRECT:
3967                /* skb_mac_header check was done by cls/act_bpf, so
3968                 * we can safely push the L2 header back before
3969                 * redirecting to another netdev
3970                 */
3971                __skb_push(skb, skb->mac_len);
3972                skb_do_redirect(skb);
3973                return NULL;
3974        default:
3975                break;
3976        }
3977#endif /* CONFIG_NET_CLS_ACT */
3978        return skb;
3979}
3980
3981/**
3982 *      netdev_rx_handler_register - register receive handler
3983 *      @dev: device to register a handler for
3984 *      @rx_handler: receive handler to register
3985 *      @rx_handler_data: data pointer that is used by rx handler
3986 *
3987 *      Register a receive handler for a device. This handler will then be
3988 *      called from __netif_receive_skb. A negative errno code is returned
3989 *      on a failure.
3990 *
3991 *      The caller must hold the rtnl_mutex.
3992 *
3993 *      For a general description of rx_handler, see enum rx_handler_result.
3994 */
3995int netdev_rx_handler_register(struct net_device *dev,
3996                               rx_handler_func_t *rx_handler,
3997                               void *rx_handler_data)
3998{
3999        ASSERT_RTNL();
4000
4001        if (dev->rx_handler)
4002                return -EBUSY;
4003
4004        /* Note: rx_handler_data must be set before rx_handler */
4005        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4006        rcu_assign_pointer(dev->rx_handler, rx_handler);
4007
4008        return 0;
4009}
4010EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
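/* Usage sketch (hypothetical identifiers, shown only to illustrate the API):
 *
 *	static rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
 *	{
 *		// inspect or consume *pskb here
 *		return RX_HANDLER_PASS;
 *	}
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(dev, my_handle_frame, my_priv);
 *	rtnl_unlock();
 *
 * The handler is then invoked from __netif_receive_skb_core() under RCU;
 * bridge, bonding, macvlan and team attach themselves this way.
 */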
4011
4012/**
4013 *      netdev_rx_handler_unregister - unregister receive handler
4014 *      @dev: device to unregister a handler from
4015 *
4016 *      Unregister a receive handler from a device.
4017 *
4018 *      The caller must hold the rtnl_mutex.
4019 */
4020void netdev_rx_handler_unregister(struct net_device *dev)
4021{
4022
4023        ASSERT_RTNL();
4024        RCU_INIT_POINTER(dev->rx_handler, NULL);
4025        /* A reader seeing a non-NULL rx_handler inside an rcu_read_lock()
4026         * section is guaranteed to see a non-NULL rx_handler_data
4027         * as well.
4028         */
4029        synchronize_net();
4030        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4031}
4032EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4033
4034/*
4035 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4036 * the special handling of PFMEMALLOC skbs.
4037 */
4038static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4039{
4040        switch (skb->protocol) {
4041        case htons(ETH_P_ARP):
4042        case htons(ETH_P_IP):
4043        case htons(ETH_P_IPV6):
4044        case htons(ETH_P_8021Q):
4045        case htons(ETH_P_8021AD):
4046                return true;
4047        default:
4048                return false;
4049        }
4050}
4051
4052static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4053                             int *ret, struct net_device *orig_dev)
4054{
4055#ifdef CONFIG_NETFILTER_INGRESS
4056        if (nf_hook_ingress_active(skb)) {
4057                if (*pt_prev) {
4058                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
4059                        *pt_prev = NULL;
4060                }
4061
4062                return nf_hook_ingress(skb);
4063        }
4064#endif /* CONFIG_NETFILTER_INGRESS */
4065        return 0;
4066}
4067
4068static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4069{
4070        struct packet_type *ptype, *pt_prev;
4071        rx_handler_func_t *rx_handler;
4072        struct net_device *orig_dev;
4073        bool deliver_exact = false;
4074        int ret = NET_RX_DROP;
4075        __be16 type;
4076
4077        net_timestamp_check(!netdev_tstamp_prequeue, skb);
4078
4079        trace_netif_receive_skb(skb);
4080
4081        orig_dev = skb->dev;
4082
4083        skb_reset_network_header(skb);
4084        if (!skb_transport_header_was_set(skb))
4085                skb_reset_transport_header(skb);
4086        skb_reset_mac_len(skb);
4087
4088        pt_prev = NULL;
4089
4090another_round:
4091        skb->skb_iif = skb->dev->ifindex;
4092
4093        __this_cpu_inc(softnet_data.processed);
4094
4095        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4096            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4097                skb = skb_vlan_untag(skb);
4098                if (unlikely(!skb))
4099                        goto out;
4100        }
4101
4102#ifdef CONFIG_NET_CLS_ACT
4103        if (skb->tc_verd & TC_NCLS) {
4104                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4105                goto ncls;
4106        }
4107#endif
4108
4109        if (pfmemalloc)
4110                goto skip_taps;
4111
4112        list_for_each_entry_rcu(ptype, &ptype_all, list) {
4113                if (pt_prev)
4114                        ret = deliver_skb(skb, pt_prev, orig_dev);
4115                pt_prev = ptype;
4116        }
4117
4118        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4119                if (pt_prev)
4120                        ret = deliver_skb(skb, pt_prev, orig_dev);
4121                pt_prev = ptype;
4122        }
4123
4124skip_taps:
4125#ifdef CONFIG_NET_INGRESS
4126        if (static_key_false(&ingress_needed)) {
4127                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4128                if (!skb)
4129                        goto out;
4130
4131                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4132                        goto out;
4133        }
4134#endif
4135#ifdef CONFIG_NET_CLS_ACT
4136        skb->tc_verd = 0;
4137ncls:
4138#endif
4139        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4140                goto drop;
4141
4142        if (skb_vlan_tag_present(skb)) {
4143                if (pt_prev) {
4144                        ret = deliver_skb(skb, pt_prev, orig_dev);
4145                        pt_prev = NULL;
4146                }
4147                if (vlan_do_receive(&skb))
4148                        goto another_round;
4149                else if (unlikely(!skb))
4150                        goto out;
4151        }
4152
4153        rx_handler = rcu_dereference(skb->dev->rx_handler);
4154        if (rx_handler) {
4155                if (pt_prev) {
4156                        ret = deliver_skb(skb, pt_prev, orig_dev);
4157                        pt_prev = NULL;
4158                }
4159                switch (rx_handler(&skb)) {
4160                case RX_HANDLER_CONSUMED:
4161                        ret = NET_RX_SUCCESS;
4162                        goto out;
4163                case RX_HANDLER_ANOTHER:
4164                        goto another_round;
4165                case RX_HANDLER_EXACT:
4166                        deliver_exact = true;
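                        /* fall through - EXACT also takes the PASS path below */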
4167                case RX_HANDLER_PASS:
4168                        break;
4169                default:
4170                        BUG();
4171                }
4172        }
4173
4174        if (unlikely(skb_vlan_tag_present(skb))) {
4175                if (skb_vlan_tag_get_id(skb))
4176                        skb->pkt_type = PACKET_OTHERHOST;
4177                /* Note: we might in the future use prio bits
4178                 * and set skb->priority like in vlan_do_receive().
4179                 * For the time being, just ignore the Priority Code Point.
4180                 */
4181                skb->vlan_tci = 0;
4182        }
4183
4184        type = skb->protocol;
4185
4186        /* deliver only exact match when indicated */
4187        if (likely(!deliver_exact)) {
4188                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4189                                       &ptype_base[ntohs(type) &
4190                                                   PTYPE_HASH_MASK]);
4191        }
4192
4193        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4194                               &orig_dev->ptype_specific);
4195
4196        if (unlikely(skb->dev != orig_dev)) {
4197                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4198                                       &skb->dev->ptype_specific);
4199        }
4200
4201        if (pt_prev) {
4202                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4203                        goto drop;
4204                else
4205                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4206        } else {
4207drop:
4208                if (!deliver_exact)
4209                        atomic_long_inc(&skb->dev->rx_dropped);
4210                else
4211                        atomic_long_inc(&skb->dev->rx_nohandler);
4212                kfree_skb(skb);
4213                /* Jamal, now you will not be able to escape explaining
4214                 * to me how you were going to use this. :-)
4215                 */
4216                ret = NET_RX_DROP;
4217        }
4218
4219out:
4220        return ret;
4221}
4222
4223static int __netif_receive_skb(struct sk_buff *skb)
4224{
4225        int ret;
4226
4227        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4228                unsigned long pflags = current->flags;
4229
4230                /*
4231                 * PFMEMALLOC skbs are special, they should
4232                 * - be delivered to SOCK_MEMALLOC sockets only
4233                 * - stay away from userspace
4234                 * - have bounded memory usage
4235                 *
4236                 * Use PF_MEMALLOC as this saves us from propagating the allocation
4237                 * context down to all allocation sites.
4238                 */
4239                current->flags |= PF_MEMALLOC;
4240                ret = __netif_receive_skb_core(skb, true);
4241                tsk_restore_flags(current, pflags, PF_MEMALLOC);
4242        } else
4243                ret = __netif_receive_skb_core(skb, false);
4244
4245        return ret;
4246}
4247
4248static int netif_receive_skb_internal(struct sk_buff *skb)
4249{
4250        int ret;
4251
4252        net_timestamp_check(netdev_tstamp_prequeue, skb);
4253
4254        if (skb_defer_rx_timestamp(skb))
4255                return NET_RX_SUCCESS;
4256
4257        rcu_read_lock();
4258
4259#ifdef CONFIG_RPS
4260        if (static_key_false(&rps_needed)) {
4261                struct rps_dev_flow voidflow, *rflow = &voidflow;
4262                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4263
4264                if (cpu >= 0) {
4265                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4266                        rcu_read_unlock();
4267                        return ret;
4268                }
4269        }
4270#endif
4271        ret = __netif_receive_skb(skb);
4272        rcu_read_unlock();
4273        return ret;
4274}
4275
4276/**
4277 *      netif_receive_skb - process receive buffer from network
4278 *      @skb: buffer to process
4279 *
4280 *      netif_receive_skb() is the main receive data processing function.
4281 *      It always succeeds. The buffer may be dropped during processing
4282 *      for congestion control or by the protocol layers.
4283 *
4284 *      This function may only be called from softirq context and interrupts
4285 *      should be enabled.
4286 *
4287 *      Return values (usually ignored):
4288 *      NET_RX_SUCCESS: no congestion
4289 *      NET_RX_DROP: packet was dropped
4290 */
4291int netif_receive_skb(struct sk_buff *skb)
4292{
4293        trace_netif_receive_skb_entry(skb);
4294
4295        return netif_receive_skb_internal(skb);
4296}
4297EXPORT_SYMBOL(netif_receive_skb);
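/* Illustrative NAPI poll routine (a sketch; my_poll()/my_fetch_rx_skb() are
 * hypothetical driver helpers):
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int done = 0;
 *		struct sk_buff *skb;
 *
 *		while (done < budget && (skb = my_fetch_rx_skb(napi))) {
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			netif_receive_skb(skb);
 *			done++;
 *		}
 *		if (done < budget)
 *			napi_complete(napi);
 *		return done;
 *	}
 */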
4298
4299/* Network device is going away; flush any packets still pending.
4300 * Called with irqs disabled.
4301 */
4302static void flush_backlog(void *arg)
4303{
4304        struct net_device *dev = arg;
4305        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4306        struct sk_buff *skb, *tmp;
4307
4308        rps_lock(sd);
4309        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4310                if (skb->dev == dev) {
4311                        __skb_unlink(skb, &sd->input_pkt_queue);
4312                        kfree_skb(skb);
4313                        input_queue_head_incr(sd);
4314                }
4315        }
4316        rps_unlock(sd);
4317
4318        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4319                if (skb->dev == dev) {
4320                        __skb_unlink(skb, &sd->process_queue);
4321                        kfree_skb(skb);
4322                        input_queue_head_incr(sd);
4323                }
4324        }
4325}
4326
4327static int napi_gro_complete(struct sk_buff *skb)
4328{
4329        struct packet_offload *ptype;
4330        __be16 type = skb->protocol;
4331        struct list_head *head = &offload_base;
4332        int err = -ENOENT;
4333
4334        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4335
4336        if (NAPI_GRO_CB(skb)->count == 1) {
4337                skb_shinfo(skb)->gso_size = 0;
4338                goto out;
4339        }
4340
4341        rcu_read_lock();
4342        list_for_each_entry_rcu(ptype, head, list) {
4343                if (ptype->type != type || !ptype->callbacks.gro_complete)
4344                        continue;
4345
4346                err = ptype->callbacks.gro_complete(skb, 0);
4347                break;
4348        }
4349        rcu_read_unlock();
4350
4351        if (err) {
4352                WARN_ON(&ptype->list == head);
4353                kfree_skb(skb);
4354                return NET_RX_SUCCESS;
4355        }
4356
4357out:
4358        return netif_receive_skb_internal(skb);
4359}
4360
4361/* napi->gro_list contains packets ordered by age, with the
4362 * youngest packets at the head of it.
4363 * Complete skbs in reverse order to reduce latency.
4364 */
4365void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4366{
4367        struct sk_buff *skb, *prev = NULL;
4368
4369        /* scan list and build reverse chain */
4370        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4371                skb->prev = prev;
4372                prev = skb;
4373        }
4374
4375        for (skb = prev; skb; skb = prev) {
4376                skb->next = NULL;
4377
4378                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4379                        return;
4380
4381                prev = skb->prev;
4382                napi_gro_complete(skb);
4383                napi->gro_count--;
4384        }
4385
4386        napi->gro_list = NULL;
4387}
4388EXPORT_SYMBOL(napi_gro_flush);
4389
4390static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4391{
4392        struct sk_buff *p;
4393        unsigned int maclen = skb->dev->hard_header_len;
4394        u32 hash = skb_get_hash_raw(skb);
4395
4396        for (p = napi->gro_list; p; p = p->next) {
4397                unsigned long diffs;
4398
4399                NAPI_GRO_CB(p)->flush = 0;
4400
4401                if (hash != skb_get_hash_raw(p)) {
4402                        NAPI_GRO_CB(p)->same_flow = 0;
4403                        continue;
4404                }
4405
4406                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4407                diffs |= p->vlan_tci ^ skb->vlan_tci;
4408                diffs |= skb_metadata_dst_cmp(p, skb);
4409                if (maclen == ETH_HLEN)
4410                        diffs |= compare_ether_header(skb_mac_header(p),
4411                                                      skb_mac_header(skb));
4412                else if (!diffs)
4413                        diffs = memcmp(skb_mac_header(p),
4414                                       skb_mac_header(skb),
4415                                       maclen);
4416                NAPI_GRO_CB(p)->same_flow = !diffs;
4417        }
4418}
4419
4420static void skb_gro_reset_offset(struct sk_buff *skb)
4421{
4422        const struct skb_shared_info *pinfo = skb_shinfo(skb);
4423        const skb_frag_t *frag0 = &pinfo->frags[0];
4424
4425        NAPI_GRO_CB(skb)->data_offset = 0;
4426        NAPI_GRO_CB(skb)->frag0 = NULL;
4427        NAPI_GRO_CB(skb)->frag0_len = 0;
4428
4429        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4430            pinfo->nr_frags &&
4431            !PageHighMem(skb_frag_page(frag0))) {
4432                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4433                NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4434        }
4435}
4436
4437static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4438{
4439        struct skb_shared_info *pinfo = skb_shinfo(skb);
4440
4441        BUG_ON(skb->end - skb->tail < grow);
4442
4443        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4444
4445        skb->data_len -= grow;
4446        skb->tail += grow;
4447
4448        pinfo->frags[0].page_offset += grow;
4449        skb_frag_size_sub(&pinfo->frags[0], grow);
4450
4451        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4452                skb_frag_unref(skb, 0);
4453                memmove(pinfo->frags, pinfo->frags + 1,
4454                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4455        }
4456}
4457
4458static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4459{
4460        struct sk_buff **pp = NULL;
4461        struct packet_offload *ptype;
4462        __be16 type = skb->protocol;
4463        struct list_head *head = &offload_base;
4464        int same_flow;
4465        enum gro_result ret;
4466        int grow;
4467
4468        if (!(skb->dev->features & NETIF_F_GRO))
4469                goto normal;
4470
4471        if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4472                goto normal;
4473
4474        gro_list_prepare(napi, skb);
4475
4476        rcu_read_lock();
4477        list_for_each_entry_rcu(ptype, head, list) {
4478                if (ptype->type != type || !ptype->callbacks.gro_receive)
4479                        continue;
4480
4481                skb_set_network_header(skb, skb_gro_offset(skb));
4482                skb_reset_mac_len(skb);
4483                NAPI_GRO_CB(skb)->same_flow = 0;
4484                NAPI_GRO_CB(skb)->flush = 0;
4485                NAPI_GRO_CB(skb)->free = 0;
4486                NAPI_GRO_CB(skb)->encap_mark = 0;
4487                NAPI_GRO_CB(skb)->is_fou = 0;
4488                NAPI_GRO_CB(skb)->is_atomic = 1;
4489                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4490
4491                /* Setup for GRO checksum validation */
4492                switch (skb->ip_summed) {
4493                case CHECKSUM_COMPLETE:
4494                        NAPI_GRO_CB(skb)->csum = skb->csum;
4495                        NAPI_GRO_CB(skb)->csum_valid = 1;
4496                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4497                        break;
4498                case CHECKSUM_UNNECESSARY:
4499                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4500                        NAPI_GRO_CB(skb)->csum_valid = 0;
4501                        break;
4502                default:
4503                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4504                        NAPI_GRO_CB(skb)->csum_valid = 0;
4505                }
4506
4507                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4508                break;
4509        }
4510        rcu_read_unlock();
4511
4512        if (&ptype->list == head)
4513                goto normal;
4514
4515        same_flow = NAPI_GRO_CB(skb)->same_flow;
4516        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4517
4518        if (pp) {
4519                struct sk_buff *nskb = *pp;
4520
4521                *pp = nskb->next;
4522                nskb->next = NULL;
4523                napi_gro_complete(nskb);
4524                napi->gro_count--;
4525        }
4526
4527        if (same_flow)
4528                goto ok;
4529
4530        if (NAPI_GRO_CB(skb)->flush)
4531                goto normal;
4532
4533        if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4534                struct sk_buff *nskb = napi->gro_list;
4535
4536                /* locate the end of the list to select the 'oldest' flow */
4537                while (nskb->next) {
4538                        pp = &nskb->next;
4539                        nskb = *pp;
4540                }
4541                *pp = NULL;
4542                nskb->next = NULL;
4543                napi_gro_complete(nskb);
4544        } else {
4545                napi->gro_count++;
4546        }
4547        NAPI_GRO_CB(skb)->count = 1;
4548        NAPI_GRO_CB(skb)->age = jiffies;
4549        NAPI_GRO_CB(skb)->last = skb;
4550        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4551        skb->next = napi->gro_list;
4552        napi->gro_list = skb;
4553        ret = GRO_HELD;
4554
4555pull:
4556        grow = skb_gro_offset(skb) - skb_headlen(skb);
4557        if (grow > 0)
4558                gro_pull_from_frag0(skb, grow);
4559ok:
4560        return ret;
4561
4562normal:
4563        ret = GRO_NORMAL;
4564        goto pull;
4565}
4566
4567struct packet_offload *gro_find_receive_by_type(__be16 type)
4568{
4569        struct list_head *offload_head = &offload_base;
4570        struct packet_offload *ptype;
4571
4572        list_for_each_entry_rcu(ptype, offload_head, list) {
4573                if (ptype->type != type || !ptype->callbacks.gro_receive)
4574                        continue;
4575                return ptype;
4576        }
4577        return NULL;
4578}
4579EXPORT_SYMBOL(gro_find_receive_by_type);
4580
4581struct packet_offload *gro_find_complete_by_type(__be16 type)
4582{
4583        struct list_head *offload_head = &offload_base;
4584        struct packet_offload *ptype;
4585
4586        list_for_each_entry_rcu(ptype, offload_head, list) {
4587                if (ptype->type != type || !ptype->callbacks.gro_complete)
4588                        continue;
4589                return ptype;
4590        }
4591        return NULL;
4592}
4593EXPORT_SYMBOL(gro_find_complete_by_type);
4594
4595static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4596{
4597        switch (ret) {
4598        case GRO_NORMAL:
4599                if (netif_receive_skb_internal(skb))
4600                        ret = GRO_DROP;
4601                break;
4602
4603        case GRO_DROP:
4604                kfree_skb(skb);
4605                break;
4606
4607        case GRO_MERGED_FREE:
4608                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4609                        skb_dst_drop(skb);
4610                        kmem_cache_free(skbuff_head_cache, skb);
4611                } else {
4612                        __kfree_skb(skb);
4613                }
4614                break;
4615
4616        case GRO_HELD:
4617        case GRO_MERGED:
4618                break;
4619        }
4620
4621        return ret;
4622}
4623
4624gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4625{
4626        skb_mark_napi_id(skb, napi);
4627        trace_napi_gro_receive_entry(skb);
4628
4629        skb_gro_reset_offset(skb);
4630
4631        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4632}
4633EXPORT_SYMBOL(napi_gro_receive);
4634
4635static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4636{
4637        if (unlikely(skb->pfmemalloc)) {
4638                consume_skb(skb);
4639                return;
4640        }
4641        __skb_pull(skb, skb_headlen(skb));
4642        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4643        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4644        skb->vlan_tci = 0;
4645        skb->dev = napi->dev;
4646        skb->skb_iif = 0;
4647        skb->encapsulation = 0;
4648        skb_shinfo(skb)->gso_type = 0;
4649        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4650
4651        napi->skb = skb;
4652}
4653
4654struct sk_buff *napi_get_frags(struct napi_struct *napi)
4655{
4656        struct sk_buff *skb = napi->skb;
4657
4658        if (!skb) {
4659                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4660                if (skb) {
4661                        napi->skb = skb;
4662                        skb_mark_napi_id(skb, napi);
4663                }
4664        }
4665        return skb;
4666}
4667EXPORT_SYMBOL(napi_get_frags);
4668
4669static gro_result_t napi_frags_finish(struct napi_struct *napi,
4670                                      struct sk_buff *skb,
4671                                      gro_result_t ret)
4672{
4673        switch (ret) {
4674        case GRO_NORMAL:
4675        case GRO_HELD:
4676                __skb_push(skb, ETH_HLEN);
4677                skb->protocol = eth_type_trans(skb, skb->dev);
4678                if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4679                        ret = GRO_DROP;
4680                break;
4681
4682        case GRO_DROP:
4683        case GRO_MERGED_FREE:
4684                napi_reuse_skb(napi, skb);
4685                break;
4686
4687        case GRO_MERGED:
4688                break;
4689        }
4690
4691        return ret;
4692}
4693
4694/* The upper GRO stack assumes the network header starts at gro_offset=0.
4695 * Drivers may call both napi_gro_frags() and napi_gro_receive(),
4696 * so we copy the Ethernet header into skb->data to have a common layout.
4697 */
4698static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4699{
4700        struct sk_buff *skb = napi->skb;
4701        const struct ethhdr *eth;
4702        unsigned int hlen = sizeof(*eth);
4703
4704        napi->skb = NULL;
4705
4706        skb_reset_mac_header(skb);
4707        skb_gro_reset_offset(skb);
4708
4709        eth = skb_gro_header_fast(skb, 0);
4710        if (unlikely(skb_gro_header_hard(skb, hlen))) {
4711                eth = skb_gro_header_slow(skb, hlen, 0);
4712                if (unlikely(!eth)) {
4713                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4714                                             __func__, napi->dev->name);
4715                        napi_reuse_skb(napi, skb);
4716                        return NULL;
4717                }
4718        } else {
4719                gro_pull_from_frag0(skb, hlen);
4720                NAPI_GRO_CB(skb)->frag0 += hlen;
4721                NAPI_GRO_CB(skb)->frag0_len -= hlen;
4722        }
4723        __skb_pull(skb, hlen);
4724
4725        /*
4726         * This works because the only protocols we care about don't require
4727         * special handling.
4728         * We'll fix it up properly in napi_frags_finish()
4729         */
4730        skb->protocol = eth->h_proto;
4731
4732        return skb;
4733}
4734
4735gro_result_t napi_gro_frags(struct napi_struct *napi)
4736{
4737        struct sk_buff *skb = napi_frags_skb(napi);
4738
4739        if (!skb)
4740                return GRO_DROP;
4741
4742        trace_napi_gro_frags_entry(skb);
4743
4744        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4745}
4746EXPORT_SYMBOL(napi_gro_frags);
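
/*
 * Sketch of the frag-based alternative used by drivers that receive into
 * pages rather than linear buffers (page, offset, len and truesize are
 * hypothetical values taken from the driver's receive descriptor):
 *
 *	skb = napi_get_frags(napi);
 *	if (!skb)
 *		return;
 *	skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 *	napi_gro_frags(napi);
 *
 * napi_frags_skb() above then pulls the Ethernet header out of frag0 so
 * the rest of the GRO stack sees the same layout as napi_gro_receive().
 */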
4747
4748/* Compute the checksum from gro_offset and return the folded value
4749 * after adding in any pseudo checksum.
4750 */
4751__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4752{
4753        __wsum wsum;
4754        __sum16 sum;
4755
4756        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4757
4758        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4759        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4760        if (likely(!sum)) {
4761                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4762                    !skb->csum_complete_sw)
4763                        netdev_rx_csum_fault(skb->dev);
4764        }
4765
4766        NAPI_GRO_CB(skb)->csum = wsum;
4767        NAPI_GRO_CB(skb)->csum_valid = 1;
4768
4769        return sum;
4770}
4771EXPORT_SYMBOL(__skb_gro_checksum_complete);
4772
4773/*
4774 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4775 * Note: called with local IRQs disabled, but exits with local IRQs enabled.
4776 */
4777static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4778{
4779#ifdef CONFIG_RPS
4780        struct softnet_data *remsd = sd->rps_ipi_list;
4781
4782        if (remsd) {
4783                sd->rps_ipi_list = NULL;
4784
4785                local_irq_enable();
4786
4787                /* Send pending IPIs to kick RPS processing on remote CPUs. */
4788                while (remsd) {
4789                        struct softnet_data *next = remsd->rps_ipi_next;
4790
4791                        if (cpu_online(remsd->cpu))
4792                                smp_call_function_single_async(remsd->cpu,
4793                                                           &remsd->csd);
4794                        remsd = next;
4795                }
4796        } else
4797#endif
4798                local_irq_enable();
4799}
4800
4801static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4802{
4803#ifdef CONFIG_RPS
4804        return sd->rps_ipi_list != NULL;
4805#else
4806        return false;
4807#endif
4808}
4809
4810static int process_backlog(struct napi_struct *napi, int quota)
4811{
4812        int work = 0;
4813        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4814
4815        /* Check if we have pending IPIs; it is better to send them now
4816         * rather than waiting for net_rx_action() to end.
4817         */
4818        if (sd_has_rps_ipi_waiting(sd)) {
4819                local_irq_disable();
4820                net_rps_action_and_irq_enable(sd);
4821        }
4822
4823        napi->weight = weight_p;
4824        local_irq_disable();
4825        while (1) {
4826                struct sk_buff *skb;
4827
4828                while ((skb = __skb_dequeue(&sd->process_queue))) {
4829                        rcu_read_lock();
4830                        local_irq_enable();
4831                        __netif_receive_skb(skb);
4832                        rcu_read_unlock();
4833                        local_irq_disable();
4834                        input_queue_head_incr(sd);
4835                        if (++work >= quota) {
4836                                local_irq_enable();
4837                                return work;
4838                        }
4839                }
4840
4841                rps_lock(sd);
4842                if (skb_queue_empty(&sd->input_pkt_queue)) {
4843                        /*
4844                         * Inline a custom version of __napi_complete().
4845                         * Only the current CPU owns and manipulates this NAPI,
4846                         * and NAPI_STATE_SCHED is the only possible flag set
4847                         * on the backlog.
4848                         * We can use a plain write instead of clear_bit(),
4849                         * and we don't need an smp_mb() memory barrier.
4850                         */
4851                        napi->state = 0;
4852                        rps_unlock(sd);
4853
4854                        break;
4855                }
4856
4857                skb_queue_splice_tail_init(&sd->input_pkt_queue,
4858                                           &sd->process_queue);
4859                rps_unlock(sd);
4860        }
4861        local_irq_enable();
4862
4863        return work;
4864}
4865
4866/**
4867 * __napi_schedule - schedule for receive
4868 * @n: entry to schedule
4869 *
4870 * The entry's receive function will be scheduled to run.
4871 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4872 */
4873void __napi_schedule(struct napi_struct *n)
4874{
4875        unsigned long flags;
4876
4877        local_irq_save(flags);
4878        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4879        local_irq_restore(flags);
4880}
4881EXPORT_SYMBOL(__napi_schedule);
4882
4883/**
4884 * __napi_schedule_irqoff - schedule for receive
4885 * @n: entry to schedule
4886 *
4887 * Variant of __napi_schedule() assuming hard irqs are masked
4888 */
4889void __napi_schedule_irqoff(struct napi_struct *n)
4890{
4891        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4892}
4893EXPORT_SYMBOL(__napi_schedule_irqoff);
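
/*
 * Sketch of typical use from a device interrupt handler (my_irq, my_priv
 * and my_disable_rx_irq are hypothetical): mask further RX interrupts,
 * then schedule the poll routine.  Drivers normally call the
 * napi_schedule()/napi_schedule_irqoff() wrappers, which test
 * NAPI_STATE_SCHED before calling the functions above.
 *
 *	static irqreturn_t my_irq(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		my_disable_rx_irq(priv);
 *		napi_schedule_irqoff(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */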
4894
4895void __napi_complete(struct napi_struct *n)
4896{
4897        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4898
4899        list_del_init(&n->poll_list);
4900        smp_mb__before_atomic();
4901        clear_bit(NAPI_STATE_SCHED, &n->state);
4902}
4903EXPORT_SYMBOL(__napi_complete);
4904
4905void napi_complete_done(struct napi_struct *n, int work_done)
4906{
4907        unsigned long flags;
4908
4909        /*
4910         * Don't let NAPI dequeue from the CPU poll list,
4911         * just in case it is running on a different CPU.
4912         */
4913        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4914                return;
4915
4916        if (n->gro_list) {
4917                unsigned long timeout = 0;
4918
4919                if (work_done)
4920                        timeout = n->dev->gro_flush_timeout;
4921
4922                if (timeout)
4923                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
4924                                      HRTIMER_MODE_REL_PINNED);
4925                else
4926                        napi_gro_flush(n, false);
4927        }
4928        if (likely(list_empty(&n->poll_list))) {
4929                WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4930        } else {
4931                /* If n->poll_list is not empty, we need to mask irqs */
4932                local_irq_save(flags);
4933                __napi_complete(n);
4934                local_irq_restore(flags);
4935        }
4936}
4937EXPORT_SYMBOL(napi_complete_done);
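
/*
 * Sketch of how a poll routine cooperates with napi_complete_done()
 * (my_poll, my_clean_rx_ring and my_enable_rx_irq are hypothetical):
 * complete and re-enable interrupts only when less than the full budget
 * was consumed.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = my_clean_rx_ring(napi, budget);
 *
 *		if (work < budget) {
 *			napi_complete_done(napi, work);
 *			my_enable_rx_irq(napi);
 *		}
 *		return work;
 *	}
 *
 * Returning the full budget tells napi_poll() below to keep the instance
 * on the poll list.
 */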
4938
4939/* Must be called under rcu_read_lock(), as we don't take a reference. */
4940static struct napi_struct *napi_by_id(unsigned int napi_id)
4941{
4942        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4943        struct napi_struct *napi;
4944
4945        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4946                if (napi->napi_id == napi_id)
4947                        return napi;
4948
4949        return NULL;
4950}
4951
4952#if defined(CONFIG_NET_RX_BUSY_POLL)
4953#define BUSY_POLL_BUDGET 8
4954bool sk_busy_loop(struct sock *sk, int nonblock)
4955{
4956        unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4957        int (*busy_poll)(struct napi_struct *dev);
4958        struct napi_struct *napi;
4959        int rc = false;
4960
4961        rcu_read_lock();
4962
4963        napi = napi_by_id(sk->sk_napi_id);
4964        if (!napi)
4965                goto out;
4966
4967        /* Note: ndo_busy_poll method is optional in linux-4.5 */
4968        busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4969
4970        do {
4971                rc = 0;
4972                local_bh_disable();
4973                if (busy_poll) {
4974                        rc = busy_poll(napi);
4975                } else if (napi_schedule_prep(napi)) {
4976                        void *have = netpoll_poll_lock(napi);
4977
4978                        if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4979                                rc = napi->poll(napi, BUSY_POLL_BUDGET);
4980                                trace_napi_poll(napi);
4981                                if (rc == BUSY_POLL_BUDGET) {
4982                                        napi_complete_done(napi, rc);
4983                                        napi_schedule(napi);
4984                                }
4985                        }
4986                        netpoll_poll_unlock(have);
4987                }
4988                if (rc > 0)
4989                        __NET_ADD_STATS(sock_net(sk),
4990                                        LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4991                local_bh_enable();
4992
4993                if (rc == LL_FLUSH_FAILED)
4994                        break; /* permanent failure */
4995
4996                cpu_relax();
4997        } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
4998                 !need_resched() && !busy_loop_timeout(end_time));
4999
5000        rc = !skb_queue_empty(&sk->sk_receive_queue);
5001out:
5002        rcu_read_unlock();
5003        return rc;
5004}
5005EXPORT_SYMBOL(sk_busy_loop);
5006
5007#endif /* CONFIG_NET_RX_BUSY_POLL */
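
/*
 * Sketch of how busy polling is reached from user space (the value is
 * illustrative): with a per-socket timeout set, the read()/poll() paths
 * end up in sk_busy_loop() and spin on the NAPI instance recorded in
 * sk->sk_napi_id instead of sleeping immediately.
 *
 *	unsigned int usecs = 50;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs));
 *
 * The net.core.busy_read and net.core.busy_poll sysctls provide
 * system-wide defaults.
 */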
5008
5009void napi_hash_add(struct napi_struct *napi)
5010{
5011        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5012            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5013                return;
5014
5015        spin_lock(&napi_hash_lock);
5016
5017        /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5018        do {
5019                if (unlikely(++napi_gen_id < NR_CPUS + 1))
5020                        napi_gen_id = NR_CPUS + 1;
5021        } while (napi_by_id(napi_gen_id));
5022        napi->napi_id = napi_gen_id;
5023
5024        hlist_add_head_rcu(&napi->napi_hash_node,
5025                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5026
5027        spin_unlock(&napi_hash_lock);
5028}
5029EXPORT_SYMBOL_GPL(napi_hash_add);
5030
5031/* Warning: the caller is responsible for making sure an RCU grace period
5032 * has elapsed before freeing the memory containing @napi.
5033 */
5034bool napi_hash_del(struct napi_struct *napi)
5035{
5036        bool rcu_sync_needed = false;
5037
5038        spin_lock(&napi_hash_lock);
5039
5040        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5041                rcu_sync_needed = true;
5042                hlist_del_rcu(&napi->napi_hash_node);
5043        }
5044        spin_unlock(&napi_hash_lock);
5045        return rcu_sync_needed;
5046}
5047EXPORT_SYMBOL_GPL(napi_hash_del);
5048
5049static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5050{
5051        struct napi_struct *napi;
5052
5053        napi = container_of(timer, struct napi_struct, timer);
5054        if (napi->gro_list)
5055                napi_schedule(napi);
5056
5057        return HRTIMER_NORESTART;
5058}
5059
5060void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5061                    int (*poll)(struct napi_struct *, int), int weight)
5062{
5063        INIT_LIST_HEAD(&napi->poll_list);
5064        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5065        napi->timer.function = napi_watchdog;
5066        napi->gro_count = 0;
5067        napi->gro_list = NULL;
5068        napi->skb = NULL;
5069        napi->poll = poll;
5070        if (weight > NAPI_POLL_WEIGHT)
5071                pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5072                            weight, dev->name);
5073        napi->weight = weight;
5074        list_add(&napi->dev_list, &dev->napi_list);
5075        napi->dev = dev;
5076#ifdef CONFIG_NETPOLL
5077        spin_lock_init(&napi->poll_lock);
5078        napi->poll_owner = -1;
5079#endif
5080        set_bit(NAPI_STATE_SCHED, &napi->state);
5081        napi_hash_add(napi);
5082}
5083EXPORT_SYMBOL(netif_napi_add);
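
/*
 * Sketch of driver-side registration, typically done at probe time
 * (netdev, priv and my_poll are hypothetical):
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *
 * and later, once the device is brought up and interrupts are enabled:
 *
 *	napi_enable(&priv->napi);
 */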
5084
5085void napi_disable(struct napi_struct *n)
5086{
5087        might_sleep();
5088        set_bit(NAPI_STATE_DISABLE, &n->state);
5089
5090        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5091                msleep(1);
5092        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5093                msleep(1);
5094
5095        hrtimer_cancel(&n->timer);
5096
5097        clear_bit(NAPI_STATE_DISABLE, &n->state);
5098}
5099EXPORT_SYMBOL(napi_disable);
5100
5101/* Must be called in process context */
5102void netif_napi_del(struct napi_struct *napi)
5103{
5104        might_sleep();
5105        if (napi_hash_del(napi))
5106                synchronize_net();
5107        list_del_init(&napi->dev_list);
5108        napi_free_frags(napi);
5109
5110        kfree_skb_list(napi->gro_list);
5111        napi->gro_list = NULL;
5112        napi->gro_count = 0;
5113}
5114EXPORT_SYMBOL(netif_napi_del);
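
/*
 * Teardown sketch (hypothetical driver code): disable polling before the
 * instance is deleted, and delete it before the net_device is freed.
 *
 *	In ndo_stop():
 *		napi_disable(&priv->napi);
 *	Before free_netdev():
 *		netif_napi_del(&priv->napi);
 */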
5115
5116static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5117{
5118        void *have;
5119        int work, weight;
5120
5121        list_del_init(&n->poll_list);
5122
5123        have = netpoll_poll_lock(n);
5124
5125        weight = n->weight;
5126
5127        /* This NAPI_STATE_SCHED test is for avoiding a race
5128         * with netpoll's poll_napi().  Only the entity which
5129         * obtains the lock and sees NAPI_STATE_SCHED set will
5130         * actually make the ->poll() call.  Therefore we avoid
5131         * accidentally calling ->poll() when NAPI is not scheduled.
5132         */
5133        work = 0;
5134        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5135                work = n->poll(n, weight);
5136                trace_napi_poll(n);
5137        }
5138
5139        WARN_ON_ONCE(work > weight);
5140
5141        if (likely(work < weight))
5142                goto out_unlock;
5143
5144        /* Drivers must not modify the NAPI state if they
5145         * consume the entire weight.  In such cases this code
5146         * still "owns" the NAPI instance and therefore can
5147         * move the instance around on the list at-will.
5148         */
5149        if (unlikely(napi_disable_pending(n))) {
5150                napi_complete(n);
5151                goto out_unlock;
5152        }
5153
5154        if (n->gro_list) {
5155                /* Flush packets that are too old.
5156                 * If HZ < 1000, flush all packets.
5157                 */
5158                napi_gro_flush(n, HZ >= 1000);
5159        }
5160
5161        /* Some drivers may have called napi_schedule
5162         * prior to exhausting their budget.
5163         */
5164        if (unlikely(!list_empty(&n->poll_list))) {
5165                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5166                             n->dev ? n->dev->name : "backlog");
5167                goto out_unlock;
5168        }
5169
5170        list_add_tail(&n->poll_list, repoll);
5171
5172out_unlock:
5173        netpoll_poll_unlock(have);
5174
5175        return work;
5176}
5177
5178static void net_rx_action(struct softirq_action *h)
5179{
5180        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5181        unsigned long time_limit = jiffies + 2;
5182        int budget = netdev_budget;
5183        LIST_HEAD(list);
5184        LIST_HEAD(repoll);
5185
5186        local_irq_disable();
5187        list_splice_init(&sd->poll_list, &list);
5188        local_irq_enable();
5189
5190        for (;;) {
5191                struct napi_struct *n;
5192
5193                if (list_empty(&list)) {
5194                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5195                                return;
5196                        break;
5197                }
5198
5199                n = list_first_entry(&list, struct napi_struct, poll_list);
5200                budget -= napi_poll(n, &repoll);
5201
5202                /* If the softirq window is exhausted then punt.
5203                 * Allow this to run for 2 jiffies, which allows
5204                 * an average latency of 1.5/HZ.
5205                 */
5206                if (unlikely(budget <= 0 ||
5207                             time_after_eq(jiffies, time_limit))) {
5208                        sd->time_squeeze++;
5209                        break;
5210                }
5211        }
5212
5213        __kfree_skb_flush();
5214        local_irq_disable();
5215
5216        list_splice_tail_init(&sd->poll_list, &list);
5217        list_splice_tail(&repoll, &list);
5218        list_splice(&list, &sd->poll_list);
5219        if (!list_empty(&sd->poll_list))
5220                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5221
5222        net_rps_action_and_irq_enable(sd);
5223}
5224
5225struct netdev_adjacent {
5226        struct net_device *dev;
5227
5228        /* upper master flag; there can be only one master device per list */
5229        bool master;
5230
5231        /* counter for the number of times this device was added to us */
5232        u16 ref_nr;
5233
5234        /* private field for the users */
5235        void *private;
5236
5237        struct list_head list;
5238        struct rcu_head rcu;
5239};
5240
5241static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5242                                                 struct list_head *adj_list)
5243{
5244        struct netdev_adjacent *adj;
5245
5246        list_for_each_entry(adj, adj_list, list) {
5247                if (adj->dev == adj_dev)
5248                        return adj;
5249        }
5250        return NULL;
5251}
5252
5253/**
5254 * netdev_has_upper_dev - Check if device is linked to an upper device
5255 * @dev: device
5256 * @upper_dev: upper device to check
5257 *
5258 * Find out if a device is linked to the specified upper device and return
5259 * true if it is. Note that this checks only the immediate upper device,
5260 * not the complete stack of devices. The caller must hold the RTNL lock.
5261 */
5262bool netdev_has_upper_dev(struct net_device *dev,
5263                          struct net_device *upper_dev)
5264{
5265        ASSERT_RTNL();
5266
5267        return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5268}
5269EXPORT_SYMBOL(netdev_has_upper_dev);
5270
5271/**
5272 * netdev_has_any_upper_dev - Check if device is linked to some device
5273 * @dev: device
5274 *
5275 * Find out if a device is linked to an upper device and return true if it
5276 * is. The caller must hold the RTNL lock.
5277 */
5278static bool netdev_has_any_upper_dev(struct net_device *dev)
5279{
5280        ASSERT_RTNL();
5281
5282        return !list_empty(&dev->all_adj_list.upper);
5283}
5284
5285/**
5286 * netdev_master_upper_dev_get - Get master upper device
5287 * @dev: device
5288 *
5289 * Find a master upper device and return a pointer to it, or NULL if there
5290 * is none. The caller must hold the RTNL lock.
5291 */
5292struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5293{
5294        struct netdev_adjacent *upper;
5295
5296        ASSERT_RTNL();
5297
5298        if (list_empty(&dev->adj_list.upper))
5299                return NULL;
5300
5301        upper = list_first_entry(&dev->adj_list.upper,
5302                                 struct netdev_adjacent, list);
5303        if (likely(upper->master))
5304                return upper->dev;
5305        return NULL;
5306}
5307EXPORT_SYMBOL(netdev_master_upper_dev_get);
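
/*
 * Sketch: a lower device (e.g. a bonding or bridge port) checking for its
 * master under RTNL (hypothetical caller):
 *
 *	rtnl_lock();
 *	master = netdev_master_upper_dev_get(dev);
 *	if (master)
 *		netdev_info(dev, "enslaved to %s\n", master->name);
 *	rtnl_unlock();
 */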
5308
5309void *netdev_adjacent_get_private(struct list_head *adj_list)
5310{
5311        struct netdev_adjacent *adj;
5312
5313        adj = list_entry(adj_list, struct netdev_adjacent, list);
5314
5315        return adj->private;
5316}
5317EXPORT_SYMBOL(netdev_adjacent_get_private);
5318
5319/**
5320 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5321 * @dev: device
5322 * @iter: list_head ** of the current position
5323 *
5324 * Gets the next device from the dev's upper list, starting from iter
5325 * position. The caller must hold RCU read lock.
5326 */
5327struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5328                                                 struct list_head **iter)
5329{
5330        struct netdev_adjacent *upper;
5331
5332        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5333
5334        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5335
5336        if (&upper->list == &dev->adj_list.upper)
5337                return NULL;
5338
5339        *iter = &upper->list;
5340
5341        return upper->dev;
5342}
5343EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
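
/*
 * Iteration sketch over the immediate upper devices under RCU
 * (do_something() is a hypothetical stand-in for the caller's work; note
 * the iterator starts at the list head for this _rcu variant):
 *
 *	struct net_device *upper;
 *	struct list_head *iter = &dev->adj_list.upper;
 *
 *	rcu_read_lock();
 *	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
 *		do_something(upper);
 *	rcu_read_unlock();
 */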
5344
5345/**
5346 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5347 * @dev: device
5348 * @iter: list_head ** of the current position
5349 *
5350 * Gets the next device from the dev's upper list, starting from iter
5351 * position. The caller must hold RCU read lock.
5352 */
5353struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5354                                                     struct list_head **iter)
5355{
5356        struct netdev_adjacent *upper;
5357
5358        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5359
5360        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5361
5362        if (&upper->list == &dev->all_adj_list.upper)
5363                return NULL;
5364
5365        *iter = &upper->list;
5366
5367        return upper->dev;
5368}
5369EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5370
5371/**
5372 * netdev_lower_get_next_private - Get the next ->private from the
5373 *                                 lower neighbour list
5374 * @dev: device
5375 * @iter: list_head ** of the current position
5376 *
5377 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5378 * list, starting from the iter position. The caller must hold either the
5379 * RTNL lock or its own locking that guarantees that the neighbour lower
5380 * list will remain unchanged.
5381 */
5382void *netdev_lower_get_next_private(struct net_device *dev,
5383                                    struct list_head **iter)
5384{
5385        struct netdev_adjacent *lower;
5386
5387        lower = list_entry(*iter, struct netdev_adjacent, list);
5388
5389        if (&lower->list == &dev->adj_list.lower)
5390                return NULL;
5391
5392        *iter = lower->list.next;
5393
5394        return lower->private;
5395}
5396EXPORT_SYMBOL(netdev_lower_get_next_private);
5397
5398/**
5399 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5400 *                                     lower neighbour list, RCU
5401 *                                     variant
5402 * @dev: device
5403 * @iter: list_head ** of the current position
5404 *
5405 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5406 * list, starting from iter position. The caller must hold RCU read lock.
5407 */
5408void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5409                                        struct list_head **iter)
5410{
5411        struct netdev_adjacent *lower;
5412
5413        WARN_ON_ONCE(!rcu_read_lock_held());
5414
5415        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5416
5417        if (&lower->list == &dev->adj_list.lower)
5418                return NULL;
5419
5420        *iter = &lower->list;
5421
5422        return lower->private;
5423}
5424EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5425
5426/**
5427 * netdev_lower_get_next - Get the next device from the lower neighbour
5428 *                         list
5429 * @dev: device
5430 * @iter: list_head ** of the current position
5431 *
5432 * Gets the next net_device from the dev's lower neighbour
5433 * list, starting from the iter position. The caller must hold the RTNL lock or
5434 * its own locking that guarantees that the neighbour lower
5435 * list will remain unchanged.
5436 */
5437void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5438{
5439        struct netdev_adjacent *lower;
5440
5441        lower = list_entry(*iter, struct netdev_adjacent, list);
5442
5443        if (&lower->list == &dev->adj_list.lower)
5444                return NULL;
5445
5446        *iter = lower->list.next;
5447
5448        return lower->dev;
5449}
5450EXPORT_SYMBOL(netdev_lower_get_next);
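
/*
 * Iteration sketch over the immediate lower devices (do_something() is a
 * hypothetical stand-in; caller holds RTNL).  Unlike the _rcu variants
 * above, this iterator starts at the first element, not the list head:
 *
 *	struct net_device *lower;
 *	struct list_head *iter = dev->adj_list.lower.next;
 *
 *	while ((lower = netdev_lower_get_next(dev, &iter)))
 *		do_something(lower);
 */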
5451
5452/**
5453 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5454 *                                     lower neighbour list, RCU
5455 *                                     variant
5456 * @dev: device
5457 *
5458 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5459 * list. The caller must hold RCU read lock.
5460 */
5461void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5462{
5463        struct netdev_adjacent *lower;
5464
5465        lower = list_first_or_null_rcu(&dev->adj_list.lower,
5466                        struct netdev_adjacent, list);
5467        if (lower)
5468                return lower->private;
5469        return NULL;
5470}
5471EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5472
5473/**
5474 * netdev_master_upper_dev_get_rcu - Get master upper device
5475 * @dev: device
5476 *
5477 * Find a master upper device and return a pointer to it, or NULL if there
5478 * is none. The caller must hold the RCU read lock.
5479 */
5480struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5481{
5482        struct netdev_adjacent *upper;
5483
5484        upper = list_first_or_null_rcu(&dev->adj_list.upper,
5485                                       struct netdev_adjacent, list);
5486        if (upper && likely(upper->master))
5487                return upper->dev;
5488        return NULL;
5489}
5490EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5491
5492static int netdev_adjacent_sysfs_add(struct net_device *dev,
5493                              struct net_device *adj_dev,
5494                              struct list_head *dev_list)
5495{
5496        char linkname[IFNAMSIZ+7];
5497        sprintf(linkname, dev_list == &dev->adj_list.upper ?
5498                "upper_%s" : "lower_%s", adj_dev->name);
5499        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5500                                 linkname);
5501}
5502static void netdev_adjacent_sysfs_del(struct net_device *dev,
5503                               char *name,
5504                               struct list_head *dev_list)
5505{
5506        char linkname[IFNAMSIZ+7];
5507        sprintf(linkname, dev_list == &dev->adj_list.upper ?
5508                "upper_%s" : "lower_%s", name);
5509        sysfs_remove_link(&(dev->dev.kobj), linkname);
5510}
5511
5512static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5513                                                 struct net_device *adj_dev,
5514                                                 struct list_head *dev_list)
5515{
5516        return (dev_list == &dev->adj_list.upper ||
5517                dev_list == &dev->adj_list.lower) &&
5518                net_eq(dev_net(dev), dev_net(adj_dev));
5519}
5520
5521static int __netdev_adjacent_dev_insert(struct net_device *dev,
5522                                        struct net_device *adj_dev,
5523                                        struct list_head *dev_list,
5524                                        void *private, bool master)
5525{
5526        struct netdev_adjacent *adj;
5527        int ret;
5528
5529        adj = __netdev_find_adj(adj_dev, dev_list);
5530
5531        if (adj) {
5532                adj->ref_nr++;
5533                return 0;
5534        }
5535
5536        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5537        if (!adj)
5538                return -ENOMEM;
5539
5540        adj->dev = adj_dev;
5541        adj->master = master;
5542        adj->ref_nr = 1;
5543        adj->private = private;
5544        dev_hold(adj_dev);
5545
5546        pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5547                 adj_dev->name, dev->name, adj_dev->name);
5548
5549        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5550                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5551                if (ret)
5552                        goto free_adj;
5553        }
5554
5555        /* Ensure that master link is always the first item in list. */
5556        if (master) {
5557                ret = sysfs_create_link(&(dev->dev.kobj),
5558                                        &(adj_dev->dev.kobj), "master");
5559                if (ret)
5560                        goto remove_symlinks;
5561
5562                list_add_rcu(&adj->list, dev_list);
5563        } else {
5564                list_add_tail_rcu(&adj->list, dev_list);
5565        }
5566
5567        return 0;
5568
5569remove_symlinks:
5570        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5571                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5572free_adj:
5573        kfree(adj);
5574        dev_put(adj_dev);
5575
5576        return ret;
5577}
5578
5579static void __netdev_adjacent_dev_remove(struct net_device *dev,
5580                                         struct net_device *adj_dev,
5581                                         struct list_head *dev_list)
5582{
5583        struct netdev_adjacent *adj;
5584
5585        adj = __netdev_find_adj(adj_dev, dev_list);
5586
5587        if (!adj) {
5588                pr_err("tried to remove device %s from %s\n",
5589                       dev->name, adj_dev->name);
5590                BUG();
5591        }
5592
5593        if (adj->ref_nr > 1) {
5594                pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5595                         adj->ref_nr-1);
5596                adj->ref_nr--;
5597                return;
5598        }
5599
5600        if (adj->master)
5601                sysfs_remove_link(&(dev->dev.kobj), "master");
5602
5603        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5604                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5605
5606        list_del_rcu(&adj->list);
5607        pr_debug("dev_put for %s, because link removed from %s to %s\n",
5608                 adj_dev->name, dev->name, adj_dev->name);
5609        dev_put(adj_dev);
5610        kfree_rcu(adj, rcu);
5611}
5612
5613static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5614                                            struct net_device *upper_dev,
5615                                            struct list_head *up_list,
5616                                            struct list_head *down_list,
5617                                            void *private, bool master)
5618{
5619        int ret;
5620
5621        ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5622                                           master);
5623        if (ret)
5624                return ret;
5625
5626        ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5627                                           false);
5628        if (ret) {
5629                __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5630                return ret;
5631        }
5632
5633        return 0;
5634}
5635
5636static int __netdev_adjacent_dev_link(struct net_device *dev,
5637                                      struct net_device *upper_dev)
5638{
5639        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5640                                                &dev->all_adj_list.upper,
5641                                                &upper_dev->all_adj_list.lower,
5642                                                NULL, false);
5643}
5644
5645static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5646                                               struct net_device *upper_dev,
5647                                               struct list_head *up_list,
5648                                               struct list_head *down_list)
5649{
5650        __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5651        __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5652}
5653
5654static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5655                                         struct net_device *upper_dev)
5656{
5657        __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5658                                           &dev->all_adj_list.upper,
5659                                           &upper_dev->all_adj_list.lower);
5660}
5661
5662static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5663                                                struct net_device *upper_dev,
5664                                                void *private, bool master)
5665{
5666        int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5667
5668        if (ret)
5669                return ret;
5670
5671        ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5672                                               &dev->adj_list.upper,
5673                                               &upper_dev->adj_list.lower,
5674                                               private, master);
5675        if (ret) {