linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <net/busy_poll.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/stat.h>
 102#include <net/dst.h>
 103#include <net/dst_metadata.h>
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/module.h>
 110#include <linux/netpoll.h>
 111#include <linux/rcupdate.h>
 112#include <linux/delay.h>
 113#include <net/iw_handler.h>
 114#include <asm/current.h>
 115#include <linux/audit.h>
 116#include <linux/dmaengine.h>
 117#include <linux/err.h>
 118#include <linux/ctype.h>
 119#include <linux/if_arp.h>
 120#include <linux/if_vlan.h>
 121#include <linux/ip.h>
 122#include <net/ip.h>
 123#include <net/mpls.h>
 124#include <linux/ipv6.h>
 125#include <linux/in.h>
 126#include <linux/jhash.h>
 127#include <linux/random.h>
 128#include <trace/events/napi.h>
 129#include <trace/events/net.h>
 130#include <trace/events/skb.h>
 131#include <linux/pci.h>
 132#include <linux/inetdevice.h>
 133#include <linux/cpu_rmap.h>
 134#include <linux/static_key.h>
 135#include <linux/hashtable.h>
 136#include <linux/vmalloc.h>
 137#include <linux/if_macvlan.h>
 138#include <linux/errqueue.h>
 139#include <linux/hrtimer.h>
 140#include <linux/netfilter_ingress.h>
 141#include <linux/sctp.h>
 142
 143#include "net-sysfs.h"
 144
 145/* Instead of increasing this, you should create a hash table. */
 146#define MAX_GRO_SKBS 8
 147
 148/* This should be increased if a protocol with a bigger head is added. */
 149#define GRO_MAX_HEAD (MAX_HEADER + 128)
 150
 151static DEFINE_SPINLOCK(ptype_lock);
 152static DEFINE_SPINLOCK(offload_lock);
 153struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 154struct list_head ptype_all __read_mostly;       /* Taps */
 155static struct list_head offload_base __read_mostly;
 156
 157static int netif_rx_internal(struct sk_buff *skb);
 158static int call_netdevice_notifiers_info(unsigned long val,
 159                                         struct net_device *dev,
 160                                         struct netdev_notifier_info *info);
 161
 162/*
 163 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 164 * semaphore.
 165 *
 166 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 167 *
 168 * Writers must hold the rtnl semaphore while they loop through the
 169 * dev_base_head list, and hold dev_base_lock for writing when they do the
 170 * actual updates.  This allows pure readers to access the list even
 171 * while a writer is preparing to update it.
 172 *
 173 * To put it another way, dev_base_lock is held for writing only to
 174 * protect against pure readers; the rtnl semaphore provides the
 175 * protection against other writers.
 176 *
  177 * For example usages, see register_netdevice() and
  178 * unregister_netdevice(), which must be called with the rtnl
  179 * semaphore held.
 180 */
 181DEFINE_RWLOCK(dev_base_lock);
 182EXPORT_SYMBOL(dev_base_lock);
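/*
 * Editor's illustration (not part of the kernel source): a pure reader can
 * walk the device list either under rcu_read_lock() or while holding
 * dev_base_lock for reading, as described above.  A minimal sketch:
 */
#if 0
static void example_dump_dev_names(struct net *net)
{
        struct net_device *dev;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                pr_info("%s\n", dev->name);
        rcu_read_unlock();
}
#endif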
 183
 184/* protects napi_hash addition/deletion and napi_gen_id */
 185static DEFINE_SPINLOCK(napi_hash_lock);
 186
 187static unsigned int napi_gen_id = NR_CPUS;
 188static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 189
 190static seqcount_t devnet_rename_seq;
 191
 192static inline void dev_base_seq_inc(struct net *net)
 193{
 194        while (++net->dev_base_seq == 0);
 195}
 196
 197static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 198{
 199        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 200
 201        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 202}
 203
 204static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 205{
 206        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 207}
 208
 209static inline void rps_lock(struct softnet_data *sd)
 210{
 211#ifdef CONFIG_RPS
 212        spin_lock(&sd->input_pkt_queue.lock);
 213#endif
 214}
 215
 216static inline void rps_unlock(struct softnet_data *sd)
 217{
 218#ifdef CONFIG_RPS
 219        spin_unlock(&sd->input_pkt_queue.lock);
 220#endif
 221}
 222
 223/* Device list insertion */
 224static void list_netdevice(struct net_device *dev)
 225{
 226        struct net *net = dev_net(dev);
 227
 228        ASSERT_RTNL();
 229
 230        write_lock_bh(&dev_base_lock);
 231        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 232        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 233        hlist_add_head_rcu(&dev->index_hlist,
 234                           dev_index_hash(net, dev->ifindex));
 235        write_unlock_bh(&dev_base_lock);
 236
 237        dev_base_seq_inc(net);
 238}
 239
 240/* Device list removal
  241 * caller must respect an RCU grace period before freeing/reusing dev
 242 */
 243static void unlist_netdevice(struct net_device *dev)
 244{
 245        ASSERT_RTNL();
 246
 247        /* Unlink dev from the device chain */
 248        write_lock_bh(&dev_base_lock);
 249        list_del_rcu(&dev->dev_list);
 250        hlist_del_rcu(&dev->name_hlist);
 251        hlist_del_rcu(&dev->index_hlist);
 252        write_unlock_bh(&dev_base_lock);
 253
 254        dev_base_seq_inc(dev_net(dev));
 255}
 256
 257/*
 258 *      Our notifier list
 259 */
 260
 261static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263/*
 264 *      Device drivers call our routines to queue packets here. We empty the
 265 *      queue in the local softnet handler.
 266 */
 267
 268DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271#ifdef CONFIG_LOCKDEP
 272/*
 273 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274 * according to dev->type
 275 */
 276static const unsigned short netdev_lock_type[] =
 277        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 290         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 291         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 292
 293static const char *const netdev_lock_name[] =
 294        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 295         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 296         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 297         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 298         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 299         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 300         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 301         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 302         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 303         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 304         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 305         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 306         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 307         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 308         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 309
 310static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 311static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312
 313static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 314{
 315        int i;
 316
 317        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 318                if (netdev_lock_type[i] == dev_type)
 319                        return i;
 320        /* the last key is used by default */
 321        return ARRAY_SIZE(netdev_lock_type) - 1;
 322}
 323
 324static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325                                                 unsigned short dev_type)
 326{
 327        int i;
 328
 329        i = netdev_lock_pos(dev_type);
 330        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 331                                   netdev_lock_name[i]);
 332}
 333
 334static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335{
 336        int i;
 337
 338        i = netdev_lock_pos(dev->type);
 339        lockdep_set_class_and_name(&dev->addr_list_lock,
 340                                   &netdev_addr_lock_key[i],
 341                                   netdev_lock_name[i]);
 342}
 343#else
 344static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 345                                                 unsigned short dev_type)
 346{
 347}
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350}
 351#endif
 352
 353/*******************************************************************************
 354
 355                Protocol management and registration routines
 356
 357*******************************************************************************/
 358
 359/*
 360 *      Add a protocol ID to the list. Now that the input handler is
 361 *      smarter we can dispense with all the messy stuff that used to be
 362 *      here.
 363 *
  364 *      BEWARE!!! Protocol handlers that mangle input packets
  365 *      MUST BE last in the hash buckets, and the walk over protocol
  366 *      handlers MUST start from the promiscuous ptype_all chain
  367 *      in net_bh. This holds today; do not change it.
  368 *      Explanation: if a handler that mangles packets were first on the
  369 *      list, it could not tell that the packet is cloned and should be
  370 *      copied-on-write, so it would modify it in place and subsequent
  371 *      readers would get a broken packet.
  372 *                                                      --ANK (980803)
 373 */
 374
 375static inline struct list_head *ptype_head(const struct packet_type *pt)
 376{
 377        if (pt->type == htons(ETH_P_ALL))
 378                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 379        else
 380                return pt->dev ? &pt->dev->ptype_specific :
 381                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 382}
 383
 384/**
 385 *      dev_add_pack - add packet handler
 386 *      @pt: packet type declaration
 387 *
 388 *      Add a protocol handler to the networking stack. The passed &packet_type
 389 *      is linked into kernel lists and may not be freed until it has been
 390 *      removed from the kernel lists.
 391 *
  392 *      This call does not sleep, therefore it cannot
  393 *      guarantee that all CPUs that are in the middle of receiving packets
  394 *      will see the new packet type (until the next received packet).
 395 */
 396
 397void dev_add_pack(struct packet_type *pt)
 398{
 399        struct list_head *head = ptype_head(pt);
 400
 401        spin_lock(&ptype_lock);
 402        list_add_rcu(&pt->list, head);
 403        spin_unlock(&ptype_lock);
 404}
 405EXPORT_SYMBOL(dev_add_pack);
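/*
 * Editor's illustration (not part of the kernel source): a minimal tap that
 * registers a packet_type for every protocol with dev_add_pack() and removes
 * it again with dev_remove_pack().  The "example_tap" names below are
 * assumptions made purely for this sketch.
 */
#if 0
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        /* The handler owns the skb reference it is given; drop it here. */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
        .type = cpu_to_be16(ETH_P_ALL),         /* tap every protocol */
        .func = example_tap_rcv,
};

static int __init example_tap_init(void)
{
        dev_add_pack(&example_tap);
        return 0;
}

static void __exit example_tap_exit(void)
{
        dev_remove_pack(&example_tap);          /* sleeps, see below */
}
#endif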
 406
 407/**
 408 *      __dev_remove_pack        - remove packet handler
 409 *      @pt: packet type declaration
 410 *
 411 *      Remove a protocol handler that was previously added to the kernel
 412 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 413 *      from the kernel lists and can be freed or reused once this function
 414 *      returns.
 415 *
 416 *      The packet type might still be in use by receivers
  417 *      and must not be freed until after all the CPUs have gone
 418 *      through a quiescent state.
 419 */
 420void __dev_remove_pack(struct packet_type *pt)
 421{
 422        struct list_head *head = ptype_head(pt);
 423        struct packet_type *pt1;
 424
 425        spin_lock(&ptype_lock);
 426
 427        list_for_each_entry(pt1, head, list) {
 428                if (pt == pt1) {
 429                        list_del_rcu(&pt->list);
 430                        goto out;
 431                }
 432        }
 433
 434        pr_warn("dev_remove_pack: %p not found\n", pt);
 435out:
 436        spin_unlock(&ptype_lock);
 437}
 438EXPORT_SYMBOL(__dev_remove_pack);
 439
 440/**
 441 *      dev_remove_pack  - remove packet handler
 442 *      @pt: packet type declaration
 443 *
 444 *      Remove a protocol handler that was previously added to the kernel
 445 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 446 *      from the kernel lists and can be freed or reused once this function
 447 *      returns.
 448 *
 449 *      This call sleeps to guarantee that no CPU is looking at the packet
 450 *      type after return.
 451 */
 452void dev_remove_pack(struct packet_type *pt)
 453{
 454        __dev_remove_pack(pt);
 455
 456        synchronize_net();
 457}
 458EXPORT_SYMBOL(dev_remove_pack);
 459
 460
 461/**
 462 *      dev_add_offload - register offload handlers
 463 *      @po: protocol offload declaration
 464 *
 465 *      Add protocol offload handlers to the networking stack. The passed
 466 *      &proto_offload is linked into kernel lists and may not be freed until
 467 *      it has been removed from the kernel lists.
 468 *
  469 *      This call does not sleep, therefore it cannot
  470 *      guarantee that all CPUs that are in the middle of receiving packets
  471 *      will see the new offload handlers (until the next received packet).
 472 */
 473void dev_add_offload(struct packet_offload *po)
 474{
 475        struct packet_offload *elem;
 476
 477        spin_lock(&offload_lock);
 478        list_for_each_entry(elem, &offload_base, list) {
 479                if (po->priority < elem->priority)
 480                        break;
 481        }
 482        list_add_rcu(&po->list, elem->list.prev);
 483        spin_unlock(&offload_lock);
 484}
 485EXPORT_SYMBOL(dev_add_offload);
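/*
 * Editor's illustration (not part of the kernel source): registering a
 * protocol offload.  The priority field decides where the entry lands in
 * offload_base (the insertion loop above keeps the list sorted).  The
 * callback fields are only hinted at in a comment, since they are filled
 * in by the real protocol code.
 */
#if 0
static struct packet_offload example_offload __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .priority = 10,         /* inserted after entries with lower priority */
        /* .callbacks = { .gro_receive = ..., .gro_complete = ... }, */
};

static int __init example_offload_init(void)
{
        dev_add_offload(&example_offload);
        return 0;
}
#endif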
 486
 487/**
 488 *      __dev_remove_offload     - remove offload handler
 489 *      @po: packet offload declaration
 490 *
 491 *      Remove a protocol offload handler that was previously added to the
 492 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 493 *      is removed from the kernel lists and can be freed or reused once this
 494 *      function returns.
 495 *
 496 *      The packet type might still be in use by receivers
  497 *      and must not be freed until after all the CPUs have gone
 498 *      through a quiescent state.
 499 */
 500static void __dev_remove_offload(struct packet_offload *po)
 501{
 502        struct list_head *head = &offload_base;
 503        struct packet_offload *po1;
 504
 505        spin_lock(&offload_lock);
 506
 507        list_for_each_entry(po1, head, list) {
 508                if (po == po1) {
 509                        list_del_rcu(&po->list);
 510                        goto out;
 511                }
 512        }
 513
 514        pr_warn("dev_remove_offload: %p not found\n", po);
 515out:
 516        spin_unlock(&offload_lock);
 517}
 518
 519/**
 520 *      dev_remove_offload       - remove packet offload handler
 521 *      @po: packet offload declaration
 522 *
 523 *      Remove a packet offload handler that was previously added to the kernel
 524 *      offload handlers by dev_add_offload(). The passed &offload_type is
 525 *      removed from the kernel lists and can be freed or reused once this
 526 *      function returns.
 527 *
 528 *      This call sleeps to guarantee that no CPU is looking at the packet
 529 *      type after return.
 530 */
 531void dev_remove_offload(struct packet_offload *po)
 532{
 533        __dev_remove_offload(po);
 534
 535        synchronize_net();
 536}
 537EXPORT_SYMBOL(dev_remove_offload);
 538
 539/******************************************************************************
 540
 541                      Device Boot-time Settings Routines
 542
 543*******************************************************************************/
 544
 545/* Boot time configuration table */
 546static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 547
 548/**
 549 *      netdev_boot_setup_add   - add new setup entry
 550 *      @name: name of the device
 551 *      @map: configured settings for the device
 552 *
  553 *      Adds a new setup entry to the dev_boot_setup list.  The function
  554 *      returns 0 on error and 1 on success.  This is a generic routine for
  555 *      all netdevices.
 556 */
 557static int netdev_boot_setup_add(char *name, struct ifmap *map)
 558{
 559        struct netdev_boot_setup *s;
 560        int i;
 561
 562        s = dev_boot_setup;
 563        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 564                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 565                        memset(s[i].name, 0, sizeof(s[i].name));
 566                        strlcpy(s[i].name, name, IFNAMSIZ);
 567                        memcpy(&s[i].map, map, sizeof(s[i].map));
 568                        break;
 569                }
 570        }
 571
 572        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 573}
 574
 575/**
 576 *      netdev_boot_setup_check - check boot time settings
 577 *      @dev: the netdevice
 578 *
  579 *      Check the boot time settings for the device.
  580 *      Any settings found are applied to the device so they can be used
  581 *      later during device probing.
  582 *      Returns 0 if no settings are found, 1 if they are.
 583 */
 584int netdev_boot_setup_check(struct net_device *dev)
 585{
 586        struct netdev_boot_setup *s = dev_boot_setup;
 587        int i;
 588
 589        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 590                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 591                    !strcmp(dev->name, s[i].name)) {
 592                        dev->irq        = s[i].map.irq;
 593                        dev->base_addr  = s[i].map.base_addr;
 594                        dev->mem_start  = s[i].map.mem_start;
 595                        dev->mem_end    = s[i].map.mem_end;
 596                        return 1;
 597                }
 598        }
 599        return 0;
 600}
 601EXPORT_SYMBOL(netdev_boot_setup_check);
 602
 603
 604/**
 605 *      netdev_boot_base        - get address from boot time settings
 606 *      @prefix: prefix for network device
 607 *      @unit: id for network device
 608 *
  609 *      Check the boot time settings for the base address of the device.
  610 *      Any settings found are applied to the device so they can be used
  611 *      later during device probing.
  612 *      Returns 0 if no settings are found.
 613 */
 614unsigned long netdev_boot_base(const char *prefix, int unit)
 615{
 616        const struct netdev_boot_setup *s = dev_boot_setup;
 617        char name[IFNAMSIZ];
 618        int i;
 619
 620        sprintf(name, "%s%d", prefix, unit);
 621
 622        /*
 623         * If device already registered then return base of 1
 624         * to indicate not to probe for this interface
 625         */
 626        if (__dev_get_by_name(&init_net, name))
 627                return 1;
 628
 629        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 630                if (!strcmp(name, s[i].name))
 631                        return s[i].map.base_addr;
 632        return 0;
 633}
 634
 635/*
  636 * Saves the settings configured at boot time for any netdevice.
 637 */
 638int __init netdev_boot_setup(char *str)
 639{
 640        int ints[5];
 641        struct ifmap map;
 642
 643        str = get_options(str, ARRAY_SIZE(ints), ints);
 644        if (!str || !*str)
 645                return 0;
 646
 647        /* Save settings */
 648        memset(&map, 0, sizeof(map));
 649        if (ints[0] > 0)
 650                map.irq = ints[1];
 651        if (ints[0] > 1)
 652                map.base_addr = ints[2];
 653        if (ints[0] > 2)
 654                map.mem_start = ints[3];
 655        if (ints[0] > 3)
 656                map.mem_end = ints[4];
 657
 658        /* Add new entry to the list */
 659        return netdev_boot_setup_add(str, &map);
 660}
 661
 662__setup("netdev=", netdev_boot_setup);
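/*
 * Editor's note: with the parser above, a command line such as
 * "netdev=9,0x300,0,0,eth0" stores irq 9 and I/O base 0x300 for eth0;
 * get_options() consumes the leading integers and the remainder of the
 * string is taken as the device name.
 */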
 663
 664/*******************************************************************************
 665
 666                            Device Interface Subroutines
 667
 668*******************************************************************************/
 669
 670/**
  671 *      dev_get_iflink  - get 'iflink' value of an interface
 672 *      @dev: targeted interface
 673 *
 674 *      Indicates the ifindex the interface is linked to.
 675 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 676 */
 677
 678int dev_get_iflink(const struct net_device *dev)
 679{
 680        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 681                return dev->netdev_ops->ndo_get_iflink(dev);
 682
 683        return dev->ifindex;
 684}
 685EXPORT_SYMBOL(dev_get_iflink);
 686
 687/**
 688 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 689 *      @dev: targeted interface
 690 *      @skb: The packet.
 691 *
  692 *      For better visibility of tunnel traffic, OVS needs to retrieve
  693 *      egress tunnel information for a packet. The following API allows
  694 *      the user to get this info.
 695 */
 696int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 697{
 698        struct ip_tunnel_info *info;
 699
 700        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 701                return -EINVAL;
 702
 703        info = skb_tunnel_info_unclone(skb);
 704        if (!info)
 705                return -ENOMEM;
 706        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 707                return -EINVAL;
 708
 709        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 710}
 711EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 712
 713/**
 714 *      __dev_get_by_name       - find a device by its name
 715 *      @net: the applicable net namespace
 716 *      @name: name to find
 717 *
 718 *      Find an interface by name. Must be called under RTNL semaphore
 719 *      or @dev_base_lock. If the name is found a pointer to the device
 720 *      is returned. If the name is not found then %NULL is returned. The
 721 *      reference counters are not incremented so the caller must be
 722 *      careful with locks.
 723 */
 724
 725struct net_device *__dev_get_by_name(struct net *net, const char *name)
 726{
 727        struct net_device *dev;
 728        struct hlist_head *head = dev_name_hash(net, name);
 729
 730        hlist_for_each_entry(dev, head, name_hlist)
 731                if (!strncmp(dev->name, name, IFNAMSIZ))
 732                        return dev;
 733
 734        return NULL;
 735}
 736EXPORT_SYMBOL(__dev_get_by_name);
 737
 738/**
 739 *      dev_get_by_name_rcu     - find a device by its name
 740 *      @net: the applicable net namespace
 741 *      @name: name to find
 742 *
 743 *      Find an interface by name.
 744 *      If the name is found a pointer to the device is returned.
 745 *      If the name is not found then %NULL is returned.
 746 *      The reference counters are not incremented so the caller must be
 747 *      careful with locks. The caller must hold RCU lock.
 748 */
 749
 750struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 751{
 752        struct net_device *dev;
 753        struct hlist_head *head = dev_name_hash(net, name);
 754
 755        hlist_for_each_entry_rcu(dev, head, name_hlist)
 756                if (!strncmp(dev->name, name, IFNAMSIZ))
 757                        return dev;
 758
 759        return NULL;
 760}
 761EXPORT_SYMBOL(dev_get_by_name_rcu);
 762
 763/**
 764 *      dev_get_by_name         - find a device by its name
 765 *      @net: the applicable net namespace
 766 *      @name: name to find
 767 *
 768 *      Find an interface by name. This can be called from any
 769 *      context and does its own locking. The returned handle has
 770 *      the usage count incremented and the caller must use dev_put() to
 771 *      release it when it is no longer needed. %NULL is returned if no
 772 *      matching device is found.
 773 */
 774
 775struct net_device *dev_get_by_name(struct net *net, const char *name)
 776{
 777        struct net_device *dev;
 778
 779        rcu_read_lock();
 780        dev = dev_get_by_name_rcu(net, name);
 781        if (dev)
 782                dev_hold(dev);
 783        rcu_read_unlock();
 784        return dev;
 785}
 786EXPORT_SYMBOL(dev_get_by_name);
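/*
 * Editor's illustration (not part of the kernel source): the refcounted
 * lookup must always be paired with dev_put() once the caller is done with
 * the device.  A minimal sketch:
 */
#if 0
static int example_use_loopback(struct net *net)
{
        struct net_device *dev = dev_get_by_name(net, "lo");

        if (!dev)
                return -ENODEV;
        pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
        dev_put(dev);
        return 0;
}
#endif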
 787
 788/**
 789 *      __dev_get_by_index - find a device by its ifindex
 790 *      @net: the applicable net namespace
 791 *      @ifindex: index of device
 792 *
  793 *      Search for an interface by index. Returns a pointer to the device,
  794 *      or %NULL if it is not found. The device has not
 795 *      had its reference counter increased so the caller must be careful
 796 *      about locking. The caller must hold either the RTNL semaphore
 797 *      or @dev_base_lock.
 798 */
 799
 800struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 801{
 802        struct net_device *dev;
 803        struct hlist_head *head = dev_index_hash(net, ifindex);
 804
 805        hlist_for_each_entry(dev, head, index_hlist)
 806                if (dev->ifindex == ifindex)
 807                        return dev;
 808
 809        return NULL;
 810}
 811EXPORT_SYMBOL(__dev_get_by_index);
 812
 813/**
 814 *      dev_get_by_index_rcu - find a device by its ifindex
 815 *      @net: the applicable net namespace
 816 *      @ifindex: index of device
 817 *
  818 *      Search for an interface by index. Returns a pointer to the device,
  819 *      or %NULL if it is not found. The device has not
 820 *      had its reference counter increased so the caller must be careful
 821 *      about locking. The caller must hold RCU lock.
 822 */
 823
 824struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 825{
 826        struct net_device *dev;
 827        struct hlist_head *head = dev_index_hash(net, ifindex);
 828
 829        hlist_for_each_entry_rcu(dev, head, index_hlist)
 830                if (dev->ifindex == ifindex)
 831                        return dev;
 832
 833        return NULL;
 834}
 835EXPORT_SYMBOL(dev_get_by_index_rcu);
 836
 837
 838/**
 839 *      dev_get_by_index - find a device by its ifindex
 840 *      @net: the applicable net namespace
 841 *      @ifindex: index of device
 842 *
  843 *      Search for an interface by index. Returns a pointer to the device,
  844 *      or NULL if it is not found. The device returned has
 845 *      had a reference added and the pointer is safe until the user calls
 846 *      dev_put to indicate they have finished with it.
 847 */
 848
 849struct net_device *dev_get_by_index(struct net *net, int ifindex)
 850{
 851        struct net_device *dev;
 852
 853        rcu_read_lock();
 854        dev = dev_get_by_index_rcu(net, ifindex);
 855        if (dev)
 856                dev_hold(dev);
 857        rcu_read_unlock();
 858        return dev;
 859}
 860EXPORT_SYMBOL(dev_get_by_index);
 861
 862/**
 863 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 864 *      @net: network namespace
 865 *      @name: a pointer to the buffer where the name will be stored.
 866 *      @ifindex: the ifindex of the interface to get the name from.
 867 *
 868 *      The use of raw_seqcount_begin() and cond_resched() before
 869 *      retrying is required as we want to give the writers a chance
 870 *      to complete when CONFIG_PREEMPT is not set.
 871 */
 872int netdev_get_name(struct net *net, char *name, int ifindex)
 873{
 874        struct net_device *dev;
 875        unsigned int seq;
 876
 877retry:
 878        seq = raw_seqcount_begin(&devnet_rename_seq);
 879        rcu_read_lock();
 880        dev = dev_get_by_index_rcu(net, ifindex);
 881        if (!dev) {
 882                rcu_read_unlock();
 883                return -ENODEV;
 884        }
 885
 886        strcpy(name, dev->name);
 887        rcu_read_unlock();
 888        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 889                cond_resched();
 890                goto retry;
 891        }
 892
 893        return 0;
 894}
 895
 896/**
 897 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 898 *      @net: the applicable net namespace
 899 *      @type: media type of device
 900 *      @ha: hardware address
 901 *
  902 *      Search for an interface by MAC address. Returns a pointer to the
  903 *      device, or NULL if it is not found.
  904 *      The caller must hold RCU or RTNL.
  905 *      The returned device has not had its ref count increased
  906 *      and the caller must therefore be careful about locking.
 907 *
 908 */
 909
 910struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 911                                       const char *ha)
 912{
 913        struct net_device *dev;
 914
 915        for_each_netdev_rcu(net, dev)
 916                if (dev->type == type &&
 917                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 918                        return dev;
 919
 920        return NULL;
 921}
 922EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
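/*
 * Editor's illustration (not part of the kernel source): since the lookup
 * does not take a reference, any use of the result must stay inside the
 * RCU read-side critical section (or be done under RTNL).
 */
#if 0
static bool example_hwaddr_in_use(struct net *net, const char *mac)
{
        struct net_device *dev;
        bool found;

        rcu_read_lock();
        dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
        found = dev != NULL;
        rcu_read_unlock();

        return found;
}
#endif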
 923
 924struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 925{
 926        struct net_device *dev;
 927
 928        ASSERT_RTNL();
 929        for_each_netdev(net, dev)
 930                if (dev->type == type)
 931                        return dev;
 932
 933        return NULL;
 934}
 935EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 936
 937struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 938{
 939        struct net_device *dev, *ret = NULL;
 940
 941        rcu_read_lock();
 942        for_each_netdev_rcu(net, dev)
 943                if (dev->type == type) {
 944                        dev_hold(dev);
 945                        ret = dev;
 946                        break;
 947                }
 948        rcu_read_unlock();
 949        return ret;
 950}
 951EXPORT_SYMBOL(dev_getfirstbyhwtype);
 952
 953/**
 954 *      __dev_get_by_flags - find any device with given flags
 955 *      @net: the applicable net namespace
 956 *      @if_flags: IFF_* values
 957 *      @mask: bitmask of bits in if_flags to check
 958 *
  959 *      Search for any interface with the given flags. Returns a pointer to
  960 *      the first matching device, or NULL if none is found. Must be called
  961 *      inside rtnl_lock(), and the result's refcount is unchanged.
 962 */
 963
 964struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 965                                      unsigned short mask)
 966{
 967        struct net_device *dev, *ret;
 968
 969        ASSERT_RTNL();
 970
 971        ret = NULL;
 972        for_each_netdev(net, dev) {
 973                if (((dev->flags ^ if_flags) & mask) == 0) {
 974                        ret = dev;
 975                        break;
 976                }
 977        }
 978        return ret;
 979}
 980EXPORT_SYMBOL(__dev_get_by_flags);
 981
 982/**
 983 *      dev_valid_name - check if name is okay for network device
 984 *      @name: name string
 985 *
  986 *      Network device names need to be valid file names
  987 *      to allow sysfs to work.  We also disallow any kind of
 988 *      whitespace.
 989 */
 990bool dev_valid_name(const char *name)
 991{
 992        if (*name == '\0')
 993                return false;
 994        if (strlen(name) >= IFNAMSIZ)
 995                return false;
 996        if (!strcmp(name, ".") || !strcmp(name, ".."))
 997                return false;
 998
 999        while (*name) {
1000                if (*name == '/' || *name == ':' || isspace(*name))
1001                        return false;
1002                name++;
1003        }
1004        return true;
1005}
1006EXPORT_SYMBOL(dev_valid_name);
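/*
 * Editor's note: with the checks above, names such as "eth0" or "br-lan"
 * are accepted, while "", ".", "..", "a/b", anything containing ':' or
 * whitespace, and names of IFNAMSIZ characters or more are rejected.
 */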
1007
1008/**
1009 *      __dev_alloc_name - allocate a name for a device
1010 *      @net: network namespace to allocate the device name in
1011 *      @name: name format string
1012 *      @buf:  scratch buffer and result name string
1013 *
 1014 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 1015 *      id. It scans the list of devices to build up a free map, then chooses
 1016 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 1017 *      while allocating the name and adding the device in order to avoid
 1018 *      duplicates.
 1019 *      Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
1020 *      Returns the number of the unit assigned or a negative errno code.
1021 */
1022
1023static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1024{
1025        int i = 0;
1026        const char *p;
1027        const int max_netdevices = 8*PAGE_SIZE;
1028        unsigned long *inuse;
1029        struct net_device *d;
1030
1031        p = strnchr(name, IFNAMSIZ-1, '%');
1032        if (p) {
1033                /*
1034                 * Verify the string as this thing may have come from
1035                 * the user.  There must be either one "%d" and no other "%"
1036                 * characters.
1037                 */
1038                if (p[1] != 'd' || strchr(p + 2, '%'))
1039                        return -EINVAL;
1040
1041                /* Use one page as a bit array of possible slots */
1042                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1043                if (!inuse)
1044                        return -ENOMEM;
1045
1046                for_each_netdev(net, d) {
1047                        if (!sscanf(d->name, name, &i))
1048                                continue;
1049                        if (i < 0 || i >= max_netdevices)
1050                                continue;
1051
1052                        /*  avoid cases where sscanf is not exact inverse of printf */
1053                        snprintf(buf, IFNAMSIZ, name, i);
1054                        if (!strncmp(buf, d->name, IFNAMSIZ))
1055                                set_bit(i, inuse);
1056                }
1057
1058                i = find_first_zero_bit(inuse, max_netdevices);
1059                free_page((unsigned long) inuse);
1060        }
1061
1062        if (buf != name)
1063                snprintf(buf, IFNAMSIZ, name, i);
1064        if (!__dev_get_by_name(net, buf))
1065                return i;
1066
1067        /* It is possible to run out of possible slots
1068         * when the name is long and there isn't enough space left
1069         * for the digits, or if all bits are used.
1070         */
1071        return -ENFILE;
1072}
1073
1074/**
1075 *      dev_alloc_name - allocate a name for a device
1076 *      @dev: device
1077 *      @name: name format string
1078 *
 1079 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 1080 *      id. It scans the list of devices to build up a free map, then chooses
 1081 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 1082 *      while allocating the name and adding the device in order to avoid
 1083 *      duplicates.
 1084 *      Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
1085 *      Returns the number of the unit assigned or a negative errno code.
1086 */
1087
1088int dev_alloc_name(struct net_device *dev, const char *name)
1089{
1090        char buf[IFNAMSIZ];
1091        struct net *net;
1092        int ret;
1093
1094        BUG_ON(!dev_net(dev));
1095        net = dev_net(dev);
1096        ret = __dev_alloc_name(net, name, buf);
1097        if (ret >= 0)
1098                strlcpy(dev->name, buf, IFNAMSIZ);
1099        return ret;
1100}
1101EXPORT_SYMBOL(dev_alloc_name);
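/*
 * Editor's illustration (not part of the kernel source): a driver that wants
 * an automatically numbered name picks a "%d" format and lets dev_alloc_name()
 * choose the first free unit.  "dummy%d" is just a placeholder pattern here.
 */
#if 0
static int example_pick_name(struct net_device *dev)
{
        int unit;

        ASSERT_RTNL();
        unit = dev_alloc_name(dev, "dummy%d"); /* e.g. dev->name becomes "dummy0" */
        return unit < 0 ? unit : 0;
}
#endif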
1102
1103static int dev_alloc_name_ns(struct net *net,
1104                             struct net_device *dev,
1105                             const char *name)
1106{
1107        char buf[IFNAMSIZ];
1108        int ret;
1109
1110        ret = __dev_alloc_name(net, name, buf);
1111        if (ret >= 0)
1112                strlcpy(dev->name, buf, IFNAMSIZ);
1113        return ret;
1114}
1115
1116static int dev_get_valid_name(struct net *net,
1117                              struct net_device *dev,
1118                              const char *name)
1119{
1120        BUG_ON(!net);
1121
1122        if (!dev_valid_name(name))
1123                return -EINVAL;
1124
1125        if (strchr(name, '%'))
1126                return dev_alloc_name_ns(net, dev, name);
1127        else if (__dev_get_by_name(net, name))
1128                return -EEXIST;
1129        else if (dev->name != name)
1130                strlcpy(dev->name, name, IFNAMSIZ);
1131
1132        return 0;
1133}
1134
1135/**
1136 *      dev_change_name - change name of a device
1137 *      @dev: device
1138 *      @newname: name (or format string) must be at least IFNAMSIZ
1139 *
 1140 *      Change the name of a device. A format string such as "eth%d"
 1141 *      can be passed for wildcarding.
1142 */
1143int dev_change_name(struct net_device *dev, const char *newname)
1144{
1145        unsigned char old_assign_type;
1146        char oldname[IFNAMSIZ];
1147        int err = 0;
1148        int ret;
1149        struct net *net;
1150
1151        ASSERT_RTNL();
1152        BUG_ON(!dev_net(dev));
1153
1154        net = dev_net(dev);
1155        if (dev->flags & IFF_UP)
1156                return -EBUSY;
1157
1158        write_seqcount_begin(&devnet_rename_seq);
1159
1160        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1161                write_seqcount_end(&devnet_rename_seq);
1162                return 0;
1163        }
1164
1165        memcpy(oldname, dev->name, IFNAMSIZ);
1166
1167        err = dev_get_valid_name(net, dev, newname);
1168        if (err < 0) {
1169                write_seqcount_end(&devnet_rename_seq);
1170                return err;
1171        }
1172
1173        if (oldname[0] && !strchr(oldname, '%'))
1174                netdev_info(dev, "renamed from %s\n", oldname);
1175
1176        old_assign_type = dev->name_assign_type;
1177        dev->name_assign_type = NET_NAME_RENAMED;
1178
1179rollback:
1180        ret = device_rename(&dev->dev, dev->name);
1181        if (ret) {
1182                memcpy(dev->name, oldname, IFNAMSIZ);
1183                dev->name_assign_type = old_assign_type;
1184                write_seqcount_end(&devnet_rename_seq);
1185                return ret;
1186        }
1187
1188        write_seqcount_end(&devnet_rename_seq);
1189
1190        netdev_adjacent_rename_links(dev, oldname);
1191
1192        write_lock_bh(&dev_base_lock);
1193        hlist_del_rcu(&dev->name_hlist);
1194        write_unlock_bh(&dev_base_lock);
1195
1196        synchronize_rcu();
1197
1198        write_lock_bh(&dev_base_lock);
1199        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1200        write_unlock_bh(&dev_base_lock);
1201
1202        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1203        ret = notifier_to_errno(ret);
1204
1205        if (ret) {
1206                /* err >= 0 after dev_alloc_name() or stores the first errno */
1207                if (err >= 0) {
1208                        err = ret;
1209                        write_seqcount_begin(&devnet_rename_seq);
1210                        memcpy(dev->name, oldname, IFNAMSIZ);
1211                        memcpy(oldname, newname, IFNAMSIZ);
1212                        dev->name_assign_type = old_assign_type;
1213                        old_assign_type = NET_NAME_RENAMED;
1214                        goto rollback;
1215                } else {
1216                        pr_err("%s: name change rollback failed: %d\n",
1217                               dev->name, ret);
1218                }
1219        }
1220
1221        return err;
1222}
1223
1224/**
1225 *      dev_set_alias - change ifalias of a device
1226 *      @dev: device
1227 *      @alias: name up to IFALIASZ
 1228 *      @len: limit of bytes to copy from @alias
 1229 *
 1230 *      Set ifalias for a device.
1231 */
1232int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1233{
1234        char *new_ifalias;
1235
1236        ASSERT_RTNL();
1237
1238        if (len >= IFALIASZ)
1239                return -EINVAL;
1240
1241        if (!len) {
1242                kfree(dev->ifalias);
1243                dev->ifalias = NULL;
1244                return 0;
1245        }
1246
1247        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1248        if (!new_ifalias)
1249                return -ENOMEM;
1250        dev->ifalias = new_ifalias;
1251
1252        strlcpy(dev->ifalias, alias, len+1);
1253        return len;
1254}
1255
1256
1257/**
1258 *      netdev_features_change - device changes features
1259 *      @dev: device to cause notification
1260 *
1261 *      Called to indicate a device has changed features.
1262 */
1263void netdev_features_change(struct net_device *dev)
1264{
1265        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1266}
1267EXPORT_SYMBOL(netdev_features_change);
1268
1269/**
1270 *      netdev_state_change - device changes state
1271 *      @dev: device to cause notification
1272 *
1273 *      Called to indicate a device has changed state. This function calls
1274 *      the notifier chains for netdev_chain and sends a NEWLINK message
1275 *      to the routing socket.
1276 */
1277void netdev_state_change(struct net_device *dev)
1278{
1279        if (dev->flags & IFF_UP) {
1280                struct netdev_notifier_change_info change_info;
1281
1282                change_info.flags_changed = 0;
1283                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1284                                              &change_info.info);
1285                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1286        }
1287}
1288EXPORT_SYMBOL(netdev_state_change);
1289
1290/**
1291 *      netdev_notify_peers - notify network peers about existence of @dev
1292 *      @dev: network device
1293 *
1294 * Generate traffic such that interested network peers are aware of
1295 * @dev, such as by generating a gratuitous ARP. This may be used when
1296 * a device wants to inform the rest of the network about some sort of
1297 * reconfiguration such as a failover event or virtual machine
1298 * migration.
1299 */
1300void netdev_notify_peers(struct net_device *dev)
1301{
1302        rtnl_lock();
1303        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1304        rtnl_unlock();
1305}
1306EXPORT_SYMBOL(netdev_notify_peers);
1307
1308static int __dev_open(struct net_device *dev)
1309{
1310        const struct net_device_ops *ops = dev->netdev_ops;
1311        int ret;
1312
1313        ASSERT_RTNL();
1314
1315        if (!netif_device_present(dev))
1316                return -ENODEV;
1317
1318        /* Block netpoll from trying to do any rx path servicing.
1319         * If we don't do this there is a chance ndo_poll_controller
1320         * or ndo_poll may be running while we open the device
1321         */
1322        netpoll_poll_disable(dev);
1323
1324        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325        ret = notifier_to_errno(ret);
1326        if (ret)
1327                return ret;
1328
1329        set_bit(__LINK_STATE_START, &dev->state);
1330
1331        if (ops->ndo_validate_addr)
1332                ret = ops->ndo_validate_addr(dev);
1333
1334        if (!ret && ops->ndo_open)
1335                ret = ops->ndo_open(dev);
1336
1337        netpoll_poll_enable(dev);
1338
1339        if (ret)
1340                clear_bit(__LINK_STATE_START, &dev->state);
1341        else {
1342                dev->flags |= IFF_UP;
1343                dev_set_rx_mode(dev);
1344                dev_activate(dev);
1345                add_device_randomness(dev->dev_addr, dev->addr_len);
1346        }
1347
1348        return ret;
1349}
1350
1351/**
1352 *      dev_open        - prepare an interface for use.
1353 *      @dev:   device to open
1354 *
1355 *      Takes a device from down to up state. The device's private open
1356 *      function is invoked and then the multicast lists are loaded. Finally
1357 *      the device is moved into the up state and a %NETDEV_UP message is
1358 *      sent to the netdev notifier chain.
1359 *
1360 *      Calling this function on an active interface is a nop. On a failure
1361 *      a negative errno code is returned.
1362 */
1363int dev_open(struct net_device *dev)
1364{
1365        int ret;
1366
1367        if (dev->flags & IFF_UP)
1368                return 0;
1369
1370        ret = __dev_open(dev);
1371        if (ret < 0)
1372                return ret;
1373
1374        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1375        call_netdevice_notifiers(NETDEV_UP, dev);
1376
1377        return ret;
1378}
1379EXPORT_SYMBOL(dev_open);
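/*
 * Editor's illustration (not part of the kernel source): dev_open() and
 * dev_close() are called with the RTNL lock held, e.g. when an in-kernel
 * user wants to force an interface up.
 */
#if 0
static int example_force_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_open(dev);    /* no-op if the device is already IFF_UP */
        rtnl_unlock();

        return err;
}
#endif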
1380
1381static int __dev_close_many(struct list_head *head)
1382{
1383        struct net_device *dev;
1384
1385        ASSERT_RTNL();
1386        might_sleep();
1387
1388        list_for_each_entry(dev, head, close_list) {
1389                /* Temporarily disable netpoll until the interface is down */
1390                netpoll_poll_disable(dev);
1391
1392                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1393
1394                clear_bit(__LINK_STATE_START, &dev->state);
1395
 1396                /* Synchronize to scheduled poll. We cannot touch the poll list, it
 1397                 * may even be on a different cpu. So just clear netif_running().
 1398                 *
 1399                 * dev->stop() will invoke napi_disable() on all of its
 1400                 * napi_struct instances on this device.
1401                 */
1402                smp_mb__after_atomic(); /* Commit netif_running(). */
1403        }
1404
1405        dev_deactivate_many(head);
1406
1407        list_for_each_entry(dev, head, close_list) {
1408                const struct net_device_ops *ops = dev->netdev_ops;
1409
1410                /*
 1411                 *      Call the device specific close. This cannot fail and is
 1412                 *      only done if the device is UP.
1413                 *
1414                 *      We allow it to be called even after a DETACH hot-plug
1415                 *      event.
1416                 */
1417                if (ops->ndo_stop)
1418                        ops->ndo_stop(dev);
1419
1420                dev->flags &= ~IFF_UP;
1421                netpoll_poll_enable(dev);
1422        }
1423
1424        return 0;
1425}
1426
1427static int __dev_close(struct net_device *dev)
1428{
1429        int retval;
1430        LIST_HEAD(single);
1431
1432        list_add(&dev->close_list, &single);
1433        retval = __dev_close_many(&single);
1434        list_del(&single);
1435
1436        return retval;
1437}
1438
1439int dev_close_many(struct list_head *head, bool unlink)
1440{
1441        struct net_device *dev, *tmp;
1442
1443        /* Remove the devices that don't need to be closed */
1444        list_for_each_entry_safe(dev, tmp, head, close_list)
1445                if (!(dev->flags & IFF_UP))
1446                        list_del_init(&dev->close_list);
1447
1448        __dev_close_many(head);
1449
1450        list_for_each_entry_safe(dev, tmp, head, close_list) {
1451                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1452                call_netdevice_notifiers(NETDEV_DOWN, dev);
1453                if (unlink)
1454                        list_del_init(&dev->close_list);
1455        }
1456
1457        return 0;
1458}
1459EXPORT_SYMBOL(dev_close_many);
1460
1461/**
1462 *      dev_close - shutdown an interface.
1463 *      @dev: device to shutdown
1464 *
1465 *      This function moves an active device into down state. A
1466 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468 *      chain.
1469 */
1470int dev_close(struct net_device *dev)
1471{
1472        if (dev->flags & IFF_UP) {
1473                LIST_HEAD(single);
1474
1475                list_add(&dev->close_list, &single);
1476                dev_close_many(&single, true);
1477                list_del(&single);
1478        }
1479        return 0;
1480}
1481EXPORT_SYMBOL(dev_close);
1482
1483
1484/**
1485 *      dev_disable_lro - disable Large Receive Offload on a device
1486 *      @dev: device
1487 *
1488 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1489 *      called under RTNL.  This is needed if received packets may be
1490 *      forwarded to another interface.
1491 */
1492void dev_disable_lro(struct net_device *dev)
1493{
1494        struct net_device *lower_dev;
1495        struct list_head *iter;
1496
1497        dev->wanted_features &= ~NETIF_F_LRO;
1498        netdev_update_features(dev);
1499
1500        if (unlikely(dev->features & NETIF_F_LRO))
1501                netdev_WARN(dev, "failed to disable LRO!\n");
1502
1503        netdev_for_each_lower_dev(dev, lower_dev, iter)
1504                dev_disable_lro(lower_dev);
1505}
1506EXPORT_SYMBOL(dev_disable_lro);
1507
1508static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1509                                   struct net_device *dev)
1510{
1511        struct netdev_notifier_info info;
1512
1513        netdev_notifier_info_init(&info, dev);
1514        return nb->notifier_call(nb, val, &info);
1515}
1516
1517static int dev_boot_phase = 1;
1518
1519/**
1520 *      register_netdevice_notifier - register a network notifier block
1521 *      @nb: notifier
1522 *
1523 *      Register a notifier to be called when network device events occur.
1524 *      The notifier passed is linked into the kernel structures and must
1525 *      not be reused until it has been unregistered. A negative errno code
1526 *      is returned on a failure.
1527 *
 1528 *      When registered, all registration and up events are replayed
 1529 *      to the new notifier to allow the caller to have a race-free
 1530 *      view of the network device list.
1531 */
1532
1533int register_netdevice_notifier(struct notifier_block *nb)
1534{
1535        struct net_device *dev;
1536        struct net_device *last;
1537        struct net *net;
1538        int err;
1539
1540        rtnl_lock();
1541        err = raw_notifier_chain_register(&netdev_chain, nb);
1542        if (err)
1543                goto unlock;
1544        if (dev_boot_phase)
1545                goto unlock;
1546        for_each_net(net) {
1547                for_each_netdev(net, dev) {
1548                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1549                        err = notifier_to_errno(err);
1550                        if (err)
1551                                goto rollback;
1552
1553                        if (!(dev->flags & IFF_UP))
1554                                continue;
1555
1556                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1557                }
1558        }
1559
1560unlock:
1561        rtnl_unlock();
1562        return err;
1563
1564rollback:
1565        last = dev;
1566        for_each_net(net) {
1567                for_each_netdev(net, dev) {
1568                        if (dev == last)
1569                                goto outroll;
1570
1571                        if (dev->flags & IFF_UP) {
1572                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1573                                                        dev);
1574                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1575                        }
1576                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1577                }
1578        }
1579
1580outroll:
1581        raw_notifier_chain_unregister(&netdev_chain, nb);
1582        goto unlock;
1583}
1584EXPORT_SYMBOL(register_netdevice_notifier);
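
/* Editor's illustrative sketch, not part of the original source: a minimal
 * notifier that logs NETDEV_UP/NETDEV_DOWN events.  As documented above,
 * register and up events for existing devices are replayed when the block
 * is registered.  The example_* names are hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_UP:
                pr_info("%s is up\n", dev->name);
                break;
        case NETDEV_DOWN:
                pr_info("%s is down\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb __maybe_unused = {
        .notifier_call = example_netdev_event,
};

/* Typical usage:
 *      register_netdevice_notifier(&example_netdev_nb);
 *      ...
 *      unregister_netdevice_notifier(&example_netdev_nb);
 */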
1585
1586/**
1587 *      unregister_netdevice_notifier - unregister a network notifier block
1588 *      @nb: notifier
1589 *
1590 *      Unregister a notifier previously registered by
1591 *      register_netdevice_notifier(). The notifier is unlinked from the
1592 *      kernel structures and may then be reused. A negative errno code
1593 *      is returned on a failure.
1594 *
1595 *      After unregistering, unregister and down device events are synthesized
1596 *      for all devices on the device list and sent to the removed notifier to
1597 *      remove the need for special-case cleanup code.
1598 */
1599
1600int unregister_netdevice_notifier(struct notifier_block *nb)
1601{
1602        struct net_device *dev;
1603        struct net *net;
1604        int err;
1605
1606        rtnl_lock();
1607        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1608        if (err)
1609                goto unlock;
1610
1611        for_each_net(net) {
1612                for_each_netdev(net, dev) {
1613                        if (dev->flags & IFF_UP) {
1614                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615                                                        dev);
1616                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1617                        }
1618                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1619                }
1620        }
1621unlock:
1622        rtnl_unlock();
1623        return err;
1624}
1625EXPORT_SYMBOL(unregister_netdevice_notifier);
1626
1627/**
1628 *      call_netdevice_notifiers_info - call all network notifier blocks
1629 *      @val: value passed unmodified to notifier function
1630 *      @dev: net_device pointer passed unmodified to notifier function
1631 *      @info: notifier information data
1632 *
1633 *      Call all network notifier blocks.  Parameters and return value
1634 *      are as for raw_notifier_call_chain().
1635 */
1636
1637static int call_netdevice_notifiers_info(unsigned long val,
1638                                         struct net_device *dev,
1639                                         struct netdev_notifier_info *info)
1640{
1641        ASSERT_RTNL();
1642        netdev_notifier_info_init(info, dev);
1643        return raw_notifier_call_chain(&netdev_chain, val, info);
1644}
1645
1646/**
1647 *      call_netdevice_notifiers - call all network notifier blocks
1648 *      @val: value passed unmodified to notifier function
1649 *      @dev: net_device pointer passed unmodified to notifier function
1650 *
1651 *      Call all network notifier blocks.  Parameters and return value
1652 *      are as for raw_notifier_call_chain().
1653 */
1654
1655int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1656{
1657        struct netdev_notifier_info info;
1658
1659        return call_netdevice_notifiers_info(val, dev, &info);
1660}
1661EXPORT_SYMBOL(call_netdevice_notifiers);
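
/* Editor's illustrative sketch, not part of the original source: code that
 * changes a device property typically notifies the chain afterwards, with
 * RTNL held.  The example_* name is hypothetical.
 */
static void __maybe_unused example_announce_mtu_change(struct net_device *dev)
{
        ASSERT_RTNL();
        call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
}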
1662
1663#ifdef CONFIG_NET_INGRESS
1664static struct static_key ingress_needed __read_mostly;
1665
1666void net_inc_ingress_queue(void)
1667{
1668        static_key_slow_inc(&ingress_needed);
1669}
1670EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1671
1672void net_dec_ingress_queue(void)
1673{
1674        static_key_slow_dec(&ingress_needed);
1675}
1676EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1677#endif
1678
1679#ifdef CONFIG_NET_EGRESS
1680static struct static_key egress_needed __read_mostly;
1681
1682void net_inc_egress_queue(void)
1683{
1684        static_key_slow_inc(&egress_needed);
1685}
1686EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1687
1688void net_dec_egress_queue(void)
1689{
1690        static_key_slow_dec(&egress_needed);
1691}
1692EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1693#endif
1694
1695static struct static_key netstamp_needed __read_mostly;
1696#ifdef HAVE_JUMP_LABEL
1697/* We are not allowed to call static_key_slow_dec() from irq context
1698 * If net_disable_timestamp() is called from irq context, defer the
1699 * static_key_slow_dec() calls.
1700 */
1701static atomic_t netstamp_needed_deferred;
1702#endif
1703
1704void net_enable_timestamp(void)
1705{
1706#ifdef HAVE_JUMP_LABEL
1707        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1708
1709        if (deferred) {
1710                while (--deferred)
1711                        static_key_slow_dec(&netstamp_needed);
1712                return;
1713        }
1714#endif
1715        static_key_slow_inc(&netstamp_needed);
1716}
1717EXPORT_SYMBOL(net_enable_timestamp);
1718
1719void net_disable_timestamp(void)
1720{
1721#ifdef HAVE_JUMP_LABEL
1722        if (in_interrupt()) {
1723                atomic_inc(&netstamp_needed_deferred);
1724                return;
1725        }
1726#endif
1727        static_key_slow_dec(&netstamp_needed);
1728}
1729EXPORT_SYMBOL(net_disable_timestamp);
1730
1731static inline void net_timestamp_set(struct sk_buff *skb)
1732{
1733        skb->tstamp.tv64 = 0;
1734        if (static_key_false(&netstamp_needed))
1735                __net_timestamp(skb);
1736}
1737
1738#define net_timestamp_check(COND, SKB)                  \
1739        if (static_key_false(&netstamp_needed)) {               \
1740                if ((COND) && !(SKB)->tstamp.tv64)      \
1741                        __net_timestamp(SKB);           \
1742        }                                               \
1743
1744bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1745{
1746        unsigned int len;
1747
1748        if (!(dev->flags & IFF_UP))
1749                return false;
1750
1751        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1752        if (skb->len <= len)
1753                return true;
1754
1755        /* if TSO is enabled, we don't care about the length as the packet
1756         * could be forwarded without being segmented before
1757         */
1758        if (skb_is_gso(skb))
1759                return true;
1760
1761        return false;
1762}
1763EXPORT_SYMBOL_GPL(is_skb_forwardable);
1764
1765int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1766{
1767        if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1768            unlikely(!is_skb_forwardable(dev, skb))) {
1769                atomic_long_inc(&dev->rx_dropped);
1770                kfree_skb(skb);
1771                return NET_RX_DROP;
1772        }
1773
1774        skb_scrub_packet(skb, true);
1775        skb->priority = 0;
1776        skb->protocol = eth_type_trans(skb, dev);
1777        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1778
1779        return 0;
1780}
1781EXPORT_SYMBOL_GPL(__dev_forward_skb);
1782
1783/**
1784 * dev_forward_skb - loopback an skb to another netif
1785 *
1786 * @dev: destination network device
1787 * @skb: buffer to forward
1788 *
1789 * return values:
1790 *      NET_RX_SUCCESS  (no congestion)
1791 *      NET_RX_DROP     (packet was dropped, but freed)
1792 *
1793 * dev_forward_skb can be used for injecting an skb from the
1794 * start_xmit function of one device into the receive queue
1795 * of another device.
1796 *
1797 * The receiving device may be in another namespace, so
1798 * we have to clear all information in the skb that could
1799 * impact namespace isolation.
1800 */
1801int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1802{
1803        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1804}
1805EXPORT_SYMBOL_GPL(dev_forward_skb);
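
/* Editor's illustrative sketch, not part of the original source: a virtual
 * device pair could hand frames to its peer from ndo_start_xmit() roughly
 * like this, in the style of veth.  The example_* name is hypothetical and
 * the peer pointer is assumed to live in ml_priv.
 */
static netdev_tx_t __maybe_unused example_pair_xmit(struct sk_buff *skb,
                                                    struct net_device *dev)
{
        struct net_device *peer = dev->ml_priv; /* hypothetical peer */

        if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
                dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}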
1806
1807static inline int deliver_skb(struct sk_buff *skb,
1808                              struct packet_type *pt_prev,
1809                              struct net_device *orig_dev)
1810{
1811        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1812                return -ENOMEM;
1813        atomic_inc(&skb->users);
1814        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1815}
1816
1817static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1818                                          struct packet_type **pt,
1819                                          struct net_device *orig_dev,
1820                                          __be16 type,
1821                                          struct list_head *ptype_list)
1822{
1823        struct packet_type *ptype, *pt_prev = *pt;
1824
1825        list_for_each_entry_rcu(ptype, ptype_list, list) {
1826                if (ptype->type != type)
1827                        continue;
1828                if (pt_prev)
1829                        deliver_skb(skb, pt_prev, orig_dev);
1830                pt_prev = ptype;
1831        }
1832        *pt = pt_prev;
1833}
1834
1835static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1836{
1837        if (!ptype->af_packet_priv || !skb->sk)
1838                return false;
1839
1840        if (ptype->id_match)
1841                return ptype->id_match(ptype, skb->sk);
1842        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1843                return true;
1844
1845        return false;
1846}
1847
1848/*
1849 *      Support routine. Sends outgoing frames to any network
1850 *      taps currently in use.
1851 */
1852
1853void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1854{
1855        struct packet_type *ptype;
1856        struct sk_buff *skb2 = NULL;
1857        struct packet_type *pt_prev = NULL;
1858        struct list_head *ptype_list = &ptype_all;
1859
1860        rcu_read_lock();
1861again:
1862        list_for_each_entry_rcu(ptype, ptype_list, list) {
1863                /* Never send packets back to the socket
1864                 * they originated from - MvS (miquels@drinkel.ow.org)
1865                 */
1866                if (skb_loop_sk(ptype, skb))
1867                        continue;
1868
1869                if (pt_prev) {
1870                        deliver_skb(skb2, pt_prev, skb->dev);
1871                        pt_prev = ptype;
1872                        continue;
1873                }
1874
1875                /* need to clone skb, done only once */
1876                skb2 = skb_clone(skb, GFP_ATOMIC);
1877                if (!skb2)
1878                        goto out_unlock;
1879
1880                net_timestamp_set(skb2);
1881
1882                /* skb->nh should be correctly
1883                 * set by sender, so that the second statement is
1884                 * just protection against buggy protocols.
1885                 */
1886                skb_reset_mac_header(skb2);
1887
1888                if (skb_network_header(skb2) < skb2->data ||
1889                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1890                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1891                                             ntohs(skb2->protocol),
1892                                             dev->name);
1893                        skb_reset_network_header(skb2);
1894                }
1895
1896                skb2->transport_header = skb2->network_header;
1897                skb2->pkt_type = PACKET_OUTGOING;
1898                pt_prev = ptype;
1899        }
1900
1901        if (ptype_list == &ptype_all) {
1902                ptype_list = &dev->ptype_all;
1903                goto again;
1904        }
1905out_unlock:
1906        if (pt_prev)
1907                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1908        rcu_read_unlock();
1909}
1910EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1911
1912/**
1913 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1914 * @dev: Network device
1915 * @txq: number of queues available
1916 *
1917 * If real_num_tx_queues is changed, the tc mappings may no longer be
1918 * valid. To resolve this, verify that the tc mapping remains valid and, if
1919 * not, NULL the mapping. With no priorities mapping to this
1920 * offset/count pair it will no longer be used. In the worst case, if TC0
1921 * is invalid, nothing can be done, so disable priority mappings. It is
1922 * expected that drivers will fix this mapping if they can before
1923 * calling netif_set_real_num_tx_queues.
1924 */
1925static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1926{
1927        int i;
1928        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1929
1930        /* If TC0 is invalidated disable TC mapping */
1931        if (tc->offset + tc->count > txq) {
1932                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1933                dev->num_tc = 0;
1934                return;
1935        }
1936
1937        /* Invalidated prio to tc mappings set to TC0 */
1938        for (i = 1; i < TC_BITMASK + 1; i++) {
1939                int q = netdev_get_prio_tc_map(dev, i);
1940
1941                tc = &dev->tc_to_txq[q];
1942                if (tc->offset + tc->count > txq) {
1943                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1944                                i, q);
1945                        netdev_set_prio_tc_map(dev, i, 0);
1946                }
1947        }
1948}
1949
1950#ifdef CONFIG_XPS
1951static DEFINE_MUTEX(xps_map_mutex);
1952#define xmap_dereference(P)             \
1953        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1954
1955static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1956                                        int cpu, u16 index)
1957{
1958        struct xps_map *map = NULL;
1959        int pos;
1960
1961        if (dev_maps)
1962                map = xmap_dereference(dev_maps->cpu_map[cpu]);
1963
1964        for (pos = 0; map && pos < map->len; pos++) {
1965                if (map->queues[pos] == index) {
1966                        if (map->len > 1) {
1967                                map->queues[pos] = map->queues[--map->len];
1968                        } else {
1969                                RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1970                                kfree_rcu(map, rcu);
1971                                map = NULL;
1972                        }
1973                        break;
1974                }
1975        }
1976
1977        return map;
1978}
1979
1980static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1981{
1982        struct xps_dev_maps *dev_maps;
1983        int cpu, i;
1984        bool active = false;
1985
1986        mutex_lock(&xps_map_mutex);
1987        dev_maps = xmap_dereference(dev->xps_maps);
1988
1989        if (!dev_maps)
1990                goto out_no_maps;
1991
1992        for_each_possible_cpu(cpu) {
1993                for (i = index; i < dev->num_tx_queues; i++) {
1994                        if (!remove_xps_queue(dev_maps, cpu, i))
1995                                break;
1996                }
1997                if (i == dev->num_tx_queues)
1998                        active = true;
1999        }
2000
2001        if (!active) {
2002                RCU_INIT_POINTER(dev->xps_maps, NULL);
2003                kfree_rcu(dev_maps, rcu);
2004        }
2005
2006        for (i = index; i < dev->num_tx_queues; i++)
2007                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2008                                             NUMA_NO_NODE);
2009
2010out_no_maps:
2011        mutex_unlock(&xps_map_mutex);
2012}
2013
2014static struct xps_map *expand_xps_map(struct xps_map *map,
2015                                      int cpu, u16 index)
2016{
2017        struct xps_map *new_map;
2018        int alloc_len = XPS_MIN_MAP_ALLOC;
2019        int i, pos;
2020
2021        for (pos = 0; map && pos < map->len; pos++) {
2022                if (map->queues[pos] != index)
2023                        continue;
2024                return map;
2025        }
2026
2027        /* Need to add queue to this CPU's existing map */
2028        if (map) {
2029                if (pos < map->alloc_len)
2030                        return map;
2031
2032                alloc_len = map->alloc_len * 2;
2033        }
2034
2035        /* Need to allocate a new map to store this queue in this CPU's map */
2036        new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2037                               cpu_to_node(cpu));
2038        if (!new_map)
2039                return NULL;
2040
2041        for (i = 0; i < pos; i++)
2042                new_map->queues[i] = map->queues[i];
2043        new_map->alloc_len = alloc_len;
2044        new_map->len = pos;
2045
2046        return new_map;
2047}
2048
2049int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2050                        u16 index)
2051{
2052        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2053        struct xps_map *map, *new_map;
2054        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2055        int cpu, numa_node_id = -2;
2056        bool active = false;
2057
2058        mutex_lock(&xps_map_mutex);
2059
2060        dev_maps = xmap_dereference(dev->xps_maps);
2061
2062        /* allocate memory for queue storage */
2063        for_each_online_cpu(cpu) {
2064                if (!cpumask_test_cpu(cpu, mask))
2065                        continue;
2066
2067                if (!new_dev_maps)
2068                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2069                if (!new_dev_maps) {
2070                        mutex_unlock(&xps_map_mutex);
2071                        return -ENOMEM;
2072                }
2073
2074                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2075                                 NULL;
2076
2077                map = expand_xps_map(map, cpu, index);
2078                if (!map)
2079                        goto error;
2080
2081                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2082        }
2083
2084        if (!new_dev_maps)
2085                goto out_no_new_maps;
2086
2087        for_each_possible_cpu(cpu) {
2088                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2089                        /* add queue to CPU maps */
2090                        int pos = 0;
2091
2092                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2093                        while ((pos < map->len) && (map->queues[pos] != index))
2094                                pos++;
2095
2096                        if (pos == map->len)
2097                                map->queues[map->len++] = index;
2098#ifdef CONFIG_NUMA
2099                        if (numa_node_id == -2)
2100                                numa_node_id = cpu_to_node(cpu);
2101                        else if (numa_node_id != cpu_to_node(cpu))
2102                                numa_node_id = -1;
2103#endif
2104                } else if (dev_maps) {
2105                        /* fill in the new device map from the old device map */
2106                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2107                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2108                }
2109
2110        }
2111
2112        rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2113
2114        /* Cleanup old maps */
2115        if (dev_maps) {
2116                for_each_possible_cpu(cpu) {
2117                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2118                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2119                        if (map && map != new_map)
2120                                kfree_rcu(map, rcu);
2121                }
2122
2123                kfree_rcu(dev_maps, rcu);
2124        }
2125
2126        dev_maps = new_dev_maps;
2127        active = true;
2128
2129out_no_new_maps:
2130        /* update Tx queue numa node */
2131        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2132                                     (numa_node_id >= 0) ? numa_node_id :
2133                                     NUMA_NO_NODE);
2134
2135        if (!dev_maps)
2136                goto out_no_maps;
2137
2138        /* removes queue from unused CPUs */
2139        for_each_possible_cpu(cpu) {
2140                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2141                        continue;
2142
2143                if (remove_xps_queue(dev_maps, cpu, index))
2144                        active = true;
2145        }
2146
2147        /* free map if not active */
2148        if (!active) {
2149                RCU_INIT_POINTER(dev->xps_maps, NULL);
2150                kfree_rcu(dev_maps, rcu);
2151        }
2152
2153out_no_maps:
2154        mutex_unlock(&xps_map_mutex);
2155
2156        return 0;
2157error:
2158        /* remove any maps that we added */
2159        for_each_possible_cpu(cpu) {
2160                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2161                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2162                                 NULL;
2163                if (new_map && new_map != map)
2164                        kfree(new_map);
2165        }
2166
2167        mutex_unlock(&xps_map_mutex);
2168
2169        kfree(new_dev_maps);
2170        return -ENOMEM;
2171}
2172EXPORT_SYMBOL(netif_set_xps_queue);
2173
2174#endif
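
/* Editor's illustrative sketch, not part of the original source: a
 * multiqueue driver could pin Tx queue i to CPU i once its queues are set
 * up.  Error handling is omitted and the example_* name is hypothetical.
 */
static void __maybe_unused example_setup_xps(struct net_device *dev)
{
        unsigned int i;

        for (i = 0; i < dev->real_num_tx_queues && i < num_possible_cpus(); i++)
                netif_set_xps_queue(dev, cpumask_of(i), i);
}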
2175/*
2176 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2177 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2178 */
2179int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2180{
2181        int rc;
2182
2183        if (txq < 1 || txq > dev->num_tx_queues)
2184                return -EINVAL;
2185
2186        if (dev->reg_state == NETREG_REGISTERED ||
2187            dev->reg_state == NETREG_UNREGISTERING) {
2188                ASSERT_RTNL();
2189
2190                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2191                                                  txq);
2192                if (rc)
2193                        return rc;
2194
2195                if (dev->num_tc)
2196                        netif_setup_tc(dev, txq);
2197
2198                if (txq < dev->real_num_tx_queues) {
2199                        qdisc_reset_all_tx_gt(dev, txq);
2200#ifdef CONFIG_XPS
2201                        netif_reset_xps_queues_gt(dev, txq);
2202#endif
2203                }
2204        }
2205
2206        dev->real_num_tx_queues = txq;
2207        return 0;
2208}
2209EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2210
2211#ifdef CONFIG_SYSFS
2212/**
2213 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2214 *      @dev: Network device
2215 *      @rxq: Actual number of RX queues
2216 *
2217 *      This must be called either with the rtnl_lock held or before
2218 *      registration of the net device.  Returns 0 on success, or a
2219 *      negative error code.  If called before registration, it always
2220 *      succeeds.
2221 */
2222int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2223{
2224        int rc;
2225
2226        if (rxq < 1 || rxq > dev->num_rx_queues)
2227                return -EINVAL;
2228
2229        if (dev->reg_state == NETREG_REGISTERED) {
2230                ASSERT_RTNL();
2231
2232                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2233                                                  rxq);
2234                if (rc)
2235                        return rc;
2236        }
2237
2238        dev->real_num_rx_queues = rxq;
2239        return 0;
2240}
2241EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2242#endif
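
/* Editor's illustrative sketch, not part of the original source: an
 * ethtool-style channel change would resize both directions under RTNL.
 * The example_* name and the count parameter are hypothetical.
 */
static int __maybe_unused example_set_channel_count(struct net_device *dev,
                                                    unsigned int count)
{
        int err;

        ASSERT_RTNL();
        err = netif_set_real_num_tx_queues(dev, count);
        if (err)
                return err;
        return netif_set_real_num_rx_queues(dev, count);
}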
2243
2244/**
2245 * netif_get_num_default_rss_queues - default number of RSS queues
2246 *
2247 * This routine should set an upper limit on the number of RSS queues
2248 * used by default by multiqueue devices.
2249 */
2250int netif_get_num_default_rss_queues(void)
2251{
2252        return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2253}
2254EXPORT_SYMBOL(netif_get_num_default_rss_queues);
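
/* Editor's illustrative sketch, not part of the original source: a driver
 * sizing its RSS queues at probe time clamps its hardware maximum with the
 * helper above.  The example_* name and the 64-queue limit are hypothetical.
 */
static int __maybe_unused example_pick_num_rss_queues(void)
{
        const int hw_max_queues = 64;   /* hypothetical hardware limit */

        return min_t(int, hw_max_queues, netif_get_num_default_rss_queues());
}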
2255
2256static inline void __netif_reschedule(struct Qdisc *q)
2257{
2258        struct softnet_data *sd;
2259        unsigned long flags;
2260
2261        local_irq_save(flags);
2262        sd = this_cpu_ptr(&softnet_data);
2263        q->next_sched = NULL;
2264        *sd->output_queue_tailp = q;
2265        sd->output_queue_tailp = &q->next_sched;
2266        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2267        local_irq_restore(flags);
2268}
2269
2270void __netif_schedule(struct Qdisc *q)
2271{
2272        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2273                __netif_reschedule(q);
2274}
2275EXPORT_SYMBOL(__netif_schedule);
2276
2277struct dev_kfree_skb_cb {
2278        enum skb_free_reason reason;
2279};
2280
2281static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2282{
2283        return (struct dev_kfree_skb_cb *)skb->cb;
2284}
2285
2286void netif_schedule_queue(struct netdev_queue *txq)
2287{
2288        rcu_read_lock();
2289        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2290                struct Qdisc *q = rcu_dereference(txq->qdisc);
2291
2292                __netif_schedule(q);
2293        }
2294        rcu_read_unlock();
2295}
2296EXPORT_SYMBOL(netif_schedule_queue);
2297
2298/**
2299 *      netif_wake_subqueue - allow sending packets on subqueue
2300 *      @dev: network device
2301 *      @queue_index: sub queue index
2302 *
2303 * Resume individual transmit queue of a device with multiple transmit queues.
2304 */
2305void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2306{
2307        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2308
2309        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2310                struct Qdisc *q;
2311
2312                rcu_read_lock();
2313                q = rcu_dereference(txq->qdisc);
2314                __netif_schedule(q);
2315                rcu_read_unlock();
2316        }
2317}
2318EXPORT_SYMBOL(netif_wake_subqueue);
2319
2320void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2321{
2322        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2323                struct Qdisc *q;
2324
2325                rcu_read_lock();
2326                q = rcu_dereference(dev_queue->qdisc);
2327                __netif_schedule(q);
2328                rcu_read_unlock();
2329        }
2330}
2331EXPORT_SYMBOL(netif_tx_wake_queue);
2332
2333void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2334{
2335        unsigned long flags;
2336
2337        if (likely(atomic_read(&skb->users) == 1)) {
2338                smp_rmb();
2339                atomic_set(&skb->users, 0);
2340        } else if (likely(!atomic_dec_and_test(&skb->users))) {
2341                return;
2342        }
2343        get_kfree_skb_cb(skb)->reason = reason;
2344        local_irq_save(flags);
2345        skb->next = __this_cpu_read(softnet_data.completion_queue);
2346        __this_cpu_write(softnet_data.completion_queue, skb);
2347        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2348        local_irq_restore(flags);
2349}
2350EXPORT_SYMBOL(__dev_kfree_skb_irq);
2351
2352void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2353{
2354        if (in_irq() || irqs_disabled())
2355                __dev_kfree_skb_irq(skb, reason);
2356        else
2357                dev_kfree_skb(skb);
2358}
2359EXPORT_SYMBOL(__dev_kfree_skb_any);
2360
2361
2362/**
2363 * netif_device_detach - mark device as removed
2364 * @dev: network device
2365 *
2366 * Mark device as removed from system and therefore no longer available.
2367 */
2368void netif_device_detach(struct net_device *dev)
2369{
2370        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2371            netif_running(dev)) {
2372                netif_tx_stop_all_queues(dev);
2373        }
2374}
2375EXPORT_SYMBOL(netif_device_detach);
2376
2377/**
2378 * netif_device_attach - mark device as attached
2379 * @dev: network device
2380 *
2381 * Mark device as attached to the system and restart if needed.
2382 */
2383void netif_device_attach(struct net_device *dev)
2384{
2385        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2386            netif_running(dev)) {
2387                netif_tx_wake_all_queues(dev);
2388                __netdev_watchdog_up(dev);
2389        }
2390}
2391EXPORT_SYMBOL(netif_device_attach);
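
/* Editor's illustrative sketch, not part of the original source: a driver's
 * suspend/resume callbacks commonly bracket their work with detach/attach.
 * The example_* names are hypothetical.
 */
static void __maybe_unused example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);
        /* ... stop DMA, save device state, power down ... */
}

static void __maybe_unused example_resume(struct net_device *dev)
{
        /* ... power up, restore device state, re-enable DMA ... */
        netif_device_attach(dev);
}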
2392
2393/*
2394 * Returns a Tx hash based on the given packet descriptor and the number of
2395 * Tx queues to be used as a distribution range.
2396 */
2397u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2398                  unsigned int num_tx_queues)
2399{
2400        u32 hash;
2401        u16 qoffset = 0;
2402        u16 qcount = num_tx_queues;
2403
2404        if (skb_rx_queue_recorded(skb)) {
2405                hash = skb_get_rx_queue(skb);
2406                while (unlikely(hash >= num_tx_queues))
2407                        hash -= num_tx_queues;
2408                return hash;
2409        }
2410
2411        if (dev->num_tc) {
2412                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2413                qoffset = dev->tc_to_txq[tc].offset;
2414                qcount = dev->tc_to_txq[tc].count;
2415        }
2416
2417        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2418}
2419EXPORT_SYMBOL(__skb_tx_hash);
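
/* Editor's illustrative sketch, not part of the original source: a
 * simplified queue-selection helper that reuses the stack's hash over the
 * device's active Tx queues via skb_tx_hash() (the real ndo_select_queue()
 * callback takes additional arguments).  The example_* name is hypothetical.
 */
static u16 __maybe_unused example_select_queue(struct net_device *dev,
                                               struct sk_buff *skb)
{
        return skb_tx_hash(dev, skb);
}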
2420
2421static void skb_warn_bad_offload(const struct sk_buff *skb)
2422{
2423        static const netdev_features_t null_features = 0;
2424        struct net_device *dev = skb->dev;
2425        const char *name = "";
2426
2427        if (!net_ratelimit())
2428                return;
2429
2430        if (dev) {
2431                if (dev->dev.parent)
2432                        name = dev_driver_string(dev->dev.parent);
2433                else
2434                        name = netdev_name(dev);
2435        }
2436        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2437             "gso_type=%d ip_summed=%d\n",
2438             name, dev ? &dev->features : &null_features,
2439             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2440             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2441             skb_shinfo(skb)->gso_type, skb->ip_summed);
2442}
2443
2444/*
2445 * Invalidate hardware checksum when packet is to be mangled, and
2446 * complete checksum manually on outgoing path.
2447 */
2448int skb_checksum_help(struct sk_buff *skb)
2449{
2450        __wsum csum;
2451        int ret = 0, offset;
2452
2453        if (skb->ip_summed == CHECKSUM_COMPLETE)
2454                goto out_set_summed;
2455
2456        if (unlikely(skb_shinfo(skb)->gso_size)) {
2457                skb_warn_bad_offload(skb);
2458                return -EINVAL;
2459        }
2460
2461        /* Before computing a checksum, we should make sure no frag could
2462         * be modified by an external entity: the checksum could be wrong.
2463         */
2464        if (skb_has_shared_frag(skb)) {
2465                ret = __skb_linearize(skb);
2466                if (ret)
2467                        goto out;
2468        }
2469
2470        offset = skb_checksum_start_offset(skb);
2471        BUG_ON(offset >= skb_headlen(skb));
2472        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2473
2474        offset += skb->csum_offset;
2475        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2476
2477        if (skb_cloned(skb) &&
2478            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2479                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2480                if (ret)
2481                        goto out;
2482        }
2483
2484        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2485out_set_summed:
2486        skb->ip_summed = CHECKSUM_NONE;
2487out:
2488        return ret;
2489}
2490EXPORT_SYMBOL(skb_checksum_help);
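
/* Editor's illustrative sketch, not part of the original source: a driver
 * whose hardware cannot checksum a particular frame can fall back to
 * software before handing the skb to DMA.  The example_* name and the
 * hw_can_csum flag are hypothetical.
 */
static int __maybe_unused example_tx_prepare_csum(struct sk_buff *skb,
                                                  bool hw_can_csum)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
                return skb_checksum_help(skb);
        return 0;
}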
2491
2492/* __skb_csum_offload_chk - Driver helper function to determine if a device
2493 * with limited checksum offload capabilities is able to offload the checksum
2494 * for a given packet.
2495 *
2496 * Arguments:
2497 *   skb - sk_buff for the packet in question
2498 *   spec - contains the description of what device can offload
2499 *   csum_encapped - returns true if the checksum being offloaded is
2500 *            encapsulated; that is, it is the checksum for the transport
2501 *            header in the inner headers.
2502 *   checksum_help - when set indicates that helper function should
2503 *            call skb_checksum_help if offload checks fail
2504 *
2505 * Returns:
2506 *   true: Packet has passed the checksum checks and should be offloadable to
2507 *         the device (a driver may still need to check for additional
2508 *         restrictions of its device)
2509 *   false: Checksum is not offloadable. If checksum_help was set then
2510 *         skb_checksum_help was called to resolve checksum for non-GSO
2511 *         packets and when IP protocol is not SCTP
2512 */
2513bool __skb_csum_offload_chk(struct sk_buff *skb,
2514                            const struct skb_csum_offl_spec *spec,
2515                            bool *csum_encapped,
2516                            bool csum_help)
2517{
2518        struct iphdr *iph;
2519        struct ipv6hdr *ipv6;
2520        void *nhdr;
2521        int protocol;
2522        u8 ip_proto;
2523
2524        if (skb->protocol == htons(ETH_P_8021Q) ||
2525            skb->protocol == htons(ETH_P_8021AD)) {
2526                if (!spec->vlan_okay)
2527                        goto need_help;
2528        }
2529
2530        /* We check whether the checksum refers to a transport layer checksum in
2531         * the outermost header or an encapsulated transport layer checksum that
2532         * corresponds to the inner headers of the skb. If the checksum is for
2533         * something else in the packet we need help.
2534         */
2535        if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2536                /* Non-encapsulated checksum */
2537                protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2538                nhdr = skb_network_header(skb);
2539                *csum_encapped = false;
2540                if (spec->no_not_encapped)
2541                        goto need_help;
2542        } else if (skb->encapsulation && spec->encap_okay &&
2543                   skb_checksum_start_offset(skb) ==
2544                   skb_inner_transport_offset(skb)) {
2545                /* Encapsulated checksum */
2546                *csum_encapped = true;
2547                switch (skb->inner_protocol_type) {
2548                case ENCAP_TYPE_ETHER:
2549                        protocol = eproto_to_ipproto(skb->inner_protocol);
2550                        break;
2551                case ENCAP_TYPE_IPPROTO:
2552                        protocol = skb->inner_protocol;
2553                        break;
2554                }
2555                nhdr = skb_inner_network_header(skb);
2556        } else {
2557                goto need_help;
2558        }
2559
2560        switch (protocol) {
2561        case IPPROTO_IP:
2562                if (!spec->ipv4_okay)
2563                        goto need_help;
2564                iph = nhdr;
2565                ip_proto = iph->protocol;
2566                if (iph->ihl != 5 && !spec->ip_options_okay)
2567                        goto need_help;
2568                break;
2569        case IPPROTO_IPV6:
2570                if (!spec->ipv6_okay)
2571                        goto need_help;
2572                if (spec->no_encapped_ipv6 && *csum_encapped)
2573                        goto need_help;
2574                ipv6 = nhdr;
2575                nhdr += sizeof(*ipv6);
2576                ip_proto = ipv6->nexthdr;
2577                break;
2578        default:
2579                goto need_help;
2580        }
2581
2582ip_proto_again:
2583        switch (ip_proto) {
2584        case IPPROTO_TCP:
2585                if (!spec->tcp_okay ||
2586                    skb->csum_offset != offsetof(struct tcphdr, check))
2587                        goto need_help;
2588                break;
2589        case IPPROTO_UDP:
2590                if (!spec->udp_okay ||
2591                    skb->csum_offset != offsetof(struct udphdr, check))
2592                        goto need_help;
2593                break;
2594        case IPPROTO_SCTP:
2595                if (!spec->sctp_okay ||
2596                    skb->csum_offset != offsetof(struct sctphdr, checksum))
2597                        goto cant_help;
2598                break;
2599        case NEXTHDR_HOP:
2600        case NEXTHDR_ROUTING:
2601        case NEXTHDR_DEST: {
2602                u8 *opthdr = nhdr;
2603
2604                if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2605                        goto need_help;
2606
2607                ip_proto = opthdr[0];
2608                nhdr += (opthdr[1] + 1) << 3;
2609
2610                goto ip_proto_again;
2611        }
2612        default:
2613                goto need_help;
2614        }
2615
2616        /* Passed the tests for offloading checksum */
2617        return true;
2618
2619need_help:
2620        if (csum_help && !skb_shinfo(skb)->gso_size)
2621                skb_checksum_help(skb);
2622cant_help:
2623        return false;
2624}
2625EXPORT_SYMBOL(__skb_csum_offload_chk);
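
/* Editor's illustrative sketch, not part of the original source: a driver
 * that can offload plain (non-encapsulated) TCP/UDP checksums over IPv4
 * and IPv6 could vet a frame like this, asking for a software fallback if
 * the checks fail.  The example_* name is hypothetical.
 */
static bool __maybe_unused example_can_hw_csum(struct sk_buff *skb)
{
        static const struct skb_csum_offl_spec spec = {
                .ipv4_okay = 1,
                .ipv6_okay = 1,
                .vlan_okay = 1,
                .tcp_okay = 1,
                .udp_okay = 1,
        };
        bool csum_encapped;

        return __skb_csum_offload_chk(skb, &spec, &csum_encapped, true);
}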
2626
2627__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628{
2629        __be16 type = skb->protocol;
2630
2631        /* Tunnel gso handlers can set protocol to ethernet. */
2632        if (type == htons(ETH_P_TEB)) {
2633                struct ethhdr *eth;
2634
2635                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636                        return 0;
2637
2638                eth = (struct ethhdr *)skb_mac_header(skb);
2639                type = eth->h_proto;
2640        }
2641
2642        return __vlan_get_protocol(skb, type, depth);
2643}
2644
2645/**
2646 *      skb_mac_gso_segment - mac layer segmentation handler.
2647 *      @skb: buffer to segment
2648 *      @features: features for the output path (see dev->features)
2649 */
2650struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651                                    netdev_features_t features)
2652{
2653        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654        struct packet_offload *ptype;
2655        int vlan_depth = skb->mac_len;
2656        __be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658        if (unlikely(!type))
2659                return ERR_PTR(-EINVAL);
2660
2661        __skb_pull(skb, vlan_depth);
2662
2663        rcu_read_lock();
2664        list_for_each_entry_rcu(ptype, &offload_base, list) {
2665                if (ptype->type == type && ptype->callbacks.gso_segment) {
2666                        segs = ptype->callbacks.gso_segment(skb, features);
2667                        break;
2668                }
2669        }
2670        rcu_read_unlock();
2671
2672        __skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674        return segs;
2675}
2676EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679/* openvswitch calls this on rx path, so we need a different check.
2680 */
2681static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682{
2683        if (tx_path)
2684                return skb->ip_summed != CHECKSUM_PARTIAL;
2685        else
2686                return skb->ip_summed == CHECKSUM_NONE;
2687}
2688
2689/**
2690 *      __skb_gso_segment - Perform segmentation on skb.
2691 *      @skb: buffer to segment
2692 *      @features: features for the output path (see dev->features)
2693 *      @tx_path: whether it is called in TX path
2694 *
2695 *      This function segments the given skb and returns a list of segments.
2696 *
2697 *      It may return NULL if the skb requires no segmentation.  This is
2698 *      only possible when GSO is used for verifying header integrity.
2699 *
2700 *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703                                  netdev_features_t features, bool tx_path)
2704{
2705        if (unlikely(skb_needs_check(skb, tx_path))) {
2706                int err;
2707
2708                skb_warn_bad_offload(skb);
2709
2710                err = skb_cow_head(skb, 0);
2711                if (err < 0)
2712                        return ERR_PTR(err);
2713        }
2714
2715        /* Only report GSO partial support if it will enable us to
2716         * support segmentation on this frame without needing additional
2717         * work.
2718         */
2719        if (features & NETIF_F_GSO_PARTIAL) {
2720                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2721                struct net_device *dev = skb->dev;
2722
2723                partial_features |= dev->features & dev->gso_partial_features;
2724                if (!skb_gso_ok(skb, features | partial_features))
2725                        features &= ~NETIF_F_GSO_PARTIAL;
2726        }
2727
2728        BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2729                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2730
2731        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2732        SKB_GSO_CB(skb)->encap_level = 0;
2733
2734        skb_reset_mac_header(skb);
2735        skb_reset_mac_len(skb);
2736
2737        return skb_mac_gso_segment(skb, features);
2738}
2739EXPORT_SYMBOL(__skb_gso_segment);
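
/* Editor's illustrative sketch, not part of the original source: segmenting
 * an skb via the skb_gso_segment() wrapper and walking the resulting list,
 * in the style of validate_xmit_skb() below.  A real caller would transmit
 * each segment; this example just counts and frees them.  The example_*
 * name is hypothetical.
 */
static int __maybe_unused example_segment_and_count(struct sk_buff *skb,
                                                    netdev_features_t features)
{
        struct sk_buff *segs;
        int n = 0;

        segs = skb_gso_segment(skb, features);
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (!segs)
                return 1;       /* no segmentation needed; caller keeps skb */

        consume_skb(skb);       /* original is no longer needed */
        while (segs) {
                struct sk_buff *next = segs->next;

                segs->next = NULL;
                kfree_skb(segs);
                segs = next;
                n++;
        }
        return n;
}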
2740
2741/* Take action when hardware reception checksum errors are detected. */
2742#ifdef CONFIG_BUG
2743void netdev_rx_csum_fault(struct net_device *dev)
2744{
2745        if (net_ratelimit()) {
2746                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2747                dump_stack();
2748        }
2749}
2750EXPORT_SYMBOL(netdev_rx_csum_fault);
2751#endif
2752
2753/* Actually, we should eliminate this check as soon as we know that:
2754 * 1. An IOMMU is present and allows us to map all the memory.
2755 * 2. No high memory really exists on this machine.
2756 */
2757
2758static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2759{
2760#ifdef CONFIG_HIGHMEM
2761        int i;
2762        if (!(dev->features & NETIF_F_HIGHDMA)) {
2763                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2764                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2765                        if (PageHighMem(skb_frag_page(frag)))
2766                                return 1;
2767                }
2768        }
2769
2770        if (PCI_DMA_BUS_IS_PHYS) {
2771                struct device *pdev = dev->dev.parent;
2772
2773                if (!pdev)
2774                        return 0;
2775                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2776                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2777                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2778                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2779                                return 1;
2780                }
2781        }
2782#endif
2783        return 0;
2784}
2785
2786/* If MPLS offload request, verify we are testing hardware MPLS features
2787 * instead of standard features for the netdev.
2788 */
2789#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2790static netdev_features_t net_mpls_features(struct sk_buff *skb,
2791                                           netdev_features_t features,
2792                                           __be16 type)
2793{
2794        if (eth_p_mpls(type))
2795                features &= skb->dev->mpls_features;
2796
2797        return features;
2798}
2799#else
2800static netdev_features_t net_mpls_features(struct sk_buff *skb,
2801                                           netdev_features_t features,
2802                                           __be16 type)
2803{
2804        return features;
2805}
2806#endif
2807
2808static netdev_features_t harmonize_features(struct sk_buff *skb,
2809        netdev_features_t features)
2810{
2811        int tmp;
2812        __be16 type;
2813
2814        type = skb_network_protocol(skb, &tmp);
2815        features = net_mpls_features(skb, features, type);
2816
2817        if (skb->ip_summed != CHECKSUM_NONE &&
2818            !can_checksum_protocol(features, type)) {
2819                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2820        } else if (illegal_highdma(skb->dev, skb)) {
2821                features &= ~NETIF_F_SG;
2822        }
2823
2824        return features;
2825}
2826
2827netdev_features_t passthru_features_check(struct sk_buff *skb,
2828                                          struct net_device *dev,
2829                                          netdev_features_t features)
2830{
2831        return features;
2832}
2833EXPORT_SYMBOL(passthru_features_check);
2834
2835static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2836                                             struct net_device *dev,
2837                                             netdev_features_t features)
2838{
2839        return vlan_features_check(skb, features);
2840}
2841
2842static netdev_features_t gso_features_check(const struct sk_buff *skb,
2843                                            struct net_device *dev,
2844                                            netdev_features_t features)
2845{
2846        u16 gso_segs = skb_shinfo(skb)->gso_segs;
2847
2848        if (gso_segs > dev->gso_max_segs)
2849                return features & ~NETIF_F_GSO_MASK;
2850
2851        /* Support for GSO partial features requires software
2852         * intervention before we can actually process the packets,
2853         * so we need to strip support for any partial features now
2854         * and we can pull them back in after we have partially
2855         * segmented the frame.
2856         */
2857        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2858                features &= ~dev->gso_partial_features;
2859
2860        /* Make sure to clear the IPv4 ID mangling feature if the
2861         * IPv4 header has the potential to be fragmented.
2862         */
2863        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2864                struct iphdr *iph = skb->encapsulation ?
2865                                    inner_ip_hdr(skb) : ip_hdr(skb);
2866
2867                if (!(iph->frag_off & htons(IP_DF)))
2868                        features &= ~NETIF_F_TSO_MANGLEID;
2869        }
2870
2871        return features;
2872}
2873
2874netdev_features_t netif_skb_features(struct sk_buff *skb)
2875{
2876        struct net_device *dev = skb->dev;
2877        netdev_features_t features = dev->features;
2878
2879        if (skb_is_gso(skb))
2880                features = gso_features_check(skb, dev, features);
2881
2882        /* If encapsulation offload request, verify we are testing
2883         * hardware encapsulation features instead of standard
2884         * features for the netdev
2885         */
2886        if (skb->encapsulation)
2887                features &= dev->hw_enc_features;
2888
2889        if (skb_vlan_tagged(skb))
2890                features = netdev_intersect_features(features,
2891                                                     dev->vlan_features |
2892                                                     NETIF_F_HW_VLAN_CTAG_TX |
2893                                                     NETIF_F_HW_VLAN_STAG_TX);
2894
2895        if (dev->netdev_ops->ndo_features_check)
2896                features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897                                                                features);
2898        else
2899                features &= dflt_features_check(skb, dev, features);
2900
2901        return harmonize_features(skb, features);
2902}
2903EXPORT_SYMBOL(netif_skb_features);
2904
2905static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2906                    struct netdev_queue *txq, bool more)
2907{
2908        unsigned int len;
2909        int rc;
2910
2911        if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2912                dev_queue_xmit_nit(skb, dev);
2913
2914        len = skb->len;
2915        trace_net_dev_start_xmit(skb, dev);
2916        rc = netdev_start_xmit(skb, dev, txq, more);
2917        trace_net_dev_xmit(skb, rc, dev, len);
2918
2919        return rc;
2920}
2921
2922struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2923                                    struct netdev_queue *txq, int *ret)
2924{
2925        struct sk_buff *skb = first;
2926        int rc = NETDEV_TX_OK;
2927
2928        while (skb) {
2929                struct sk_buff *next = skb->next;
2930
2931                skb->next = NULL;
2932                rc = xmit_one(skb, dev, txq, next != NULL);
2933                if (unlikely(!dev_xmit_complete(rc))) {
2934                        skb->next = next;
2935                        goto out;
2936                }
2937
2938                skb = next;
2939                if (netif_xmit_stopped(txq) && skb) {
2940                        rc = NETDEV_TX_BUSY;
2941                        break;
2942                }
2943        }
2944
2945out:
2946        *ret = rc;
2947        return skb;
2948}
2949
2950static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2951                                          netdev_features_t features)
2952{
2953        if (skb_vlan_tag_present(skb) &&
2954            !vlan_hw_offload_capable(features, skb->vlan_proto))
2955                skb = __vlan_hwaccel_push_inside(skb);
2956        return skb;
2957}
2958
2959static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2960{
2961        netdev_features_t features;
2962
2963        features = netif_skb_features(skb);
2964        skb = validate_xmit_vlan(skb, features);
2965        if (unlikely(!skb))
2966                goto out_null;
2967
2968        if (netif_needs_gso(skb, features)) {
2969                struct sk_buff *segs;
2970
2971                segs = skb_gso_segment(skb, features);
2972                if (IS_ERR(segs)) {
2973                        goto out_kfree_skb;
2974                } else if (segs) {
2975                        consume_skb(skb);
2976                        skb = segs;
2977                }
2978        } else {
2979                if (skb_needs_linearize(skb, features) &&
2980                    __skb_linearize(skb))
2981                        goto out_kfree_skb;
2982
2983                /* If packet is not checksummed and device does not
2984                 * support checksumming for this protocol, complete
2985                 * checksumming here.
2986                 */
2987                if (skb->ip_summed == CHECKSUM_PARTIAL) {
2988                        if (skb->encapsulation)
2989                                skb_set_inner_transport_header(skb,
2990                                                               skb_checksum_start_offset(skb));
2991                        else
2992                                skb_set_transport_header(skb,
2993                                                         skb_checksum_start_offset(skb));
2994                        if (!(features & NETIF_F_CSUM_MASK) &&
2995                            skb_checksum_help(skb))
2996                                goto out_kfree_skb;
2997                }
2998        }
2999
3000        return skb;
3001
3002out_kfree_skb:
3003        kfree_skb(skb);
3004out_null:
3005        atomic_long_inc(&dev->tx_dropped);
3006        return NULL;
3007}
3008
3009struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3010{
3011        struct sk_buff *next, *head = NULL, *tail;
3012
3013        for (; skb != NULL; skb = next) {
3014                next = skb->next;
3015                skb->next = NULL;
3016
3017                /* in case skb won't be segmented, point to itself */
3018                skb->prev = skb;
3019
3020                skb = validate_xmit_skb(skb, dev);
3021                if (!skb)
3022                        continue;
3023
3024                if (!head)
3025                        head = skb;
3026                else
3027                        tail->next = skb;
3028                /* If skb was segmented, skb->prev points to
3029                 * the last segment. If not, it still contains skb.
3030                 */
3031                tail = skb->prev;
3032        }
3033        return head;
3034}
3035
3036static void qdisc_pkt_len_init(struct sk_buff *skb)
3037{
3038        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3039
3040        qdisc_skb_cb(skb)->pkt_len = skb->len;
3041
3042        /* To get a more precise estimate of the bytes sent on the wire,
3043         * we add the header size of all segments to pkt_len
3044         */
3045        if (shinfo->gso_size)  {
3046                unsigned int hdr_len;
3047                u16 gso_segs = shinfo->gso_segs;
3048
3049                /* mac layer + network layer */
3050                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3051
3052                /* + transport layer */
3053                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3054                        hdr_len += tcp_hdrlen(skb);
3055                else
3056                        hdr_len += sizeof(struct udphdr);
3057
3058                if (shinfo->gso_type & SKB_GSO_DODGY)
3059                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3060                                                shinfo->gso_size);
3061
3062                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3063        }
3064}
3065
3066static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3067                                 struct net_device *dev,
3068                                 struct netdev_queue *txq)
3069{
3070        spinlock_t *root_lock = qdisc_lock(q);
3071        bool contended;
3072        int rc;
3073
3074        qdisc_calculate_pkt_len(skb, q);
3075        /*
3076         * Heuristic to force contended enqueues to serialize on a
3077         * separate lock before trying to get qdisc main lock.
3078         * This permits __QDISC___STATE_RUNNING owner to get the lock more
3079         * often and dequeue packets faster.
3080         */
3081        contended = qdisc_is_running(q);
3082        if (unlikely(contended))
3083                spin_lock(&q->busylock);
3084
3085        spin_lock(root_lock);
3086        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3087                kfree_skb(skb);
3088                rc = NET_XMIT_DROP;
3089        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3090                   qdisc_run_begin(q)) {
3091                /*
3092                 * This is a work-conserving queue; there are no old skbs
3093                 * waiting to be sent out; and the qdisc is not running -
3094                 * xmit the skb directly.
3095                 */
3096
3097                qdisc_bstats_update(q, skb);
3098
3099                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3100                        if (unlikely(contended)) {
3101                                spin_unlock(&q->busylock);
3102                                contended = false;
3103                        }
3104                        __qdisc_run(q);
3105                } else
3106                        qdisc_run_end(q);
3107
3108                rc = NET_XMIT_SUCCESS;
3109        } else {
3110                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
3111                if (qdisc_run_begin(q)) {
3112                        if (unlikely(contended)) {
3113                                spin_unlock(&q->busylock);
3114                                contended = false;
3115                        }
3116                        __qdisc_run(q);
3117                }
3118        }
3119        spin_unlock(root_lock);
3120        if (unlikely(contended))
3121                spin_unlock(&q->busylock);
3122        return rc;
3123}
3124
3125#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3126static void skb_update_prio(struct sk_buff *skb)
3127{
3128        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3129
3130        if (!skb->priority && skb->sk && map) {
3131                unsigned int prioidx =
3132                        sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3133
3134                if (prioidx < map->priomap_len)
3135                        skb->priority = map->priomap[prioidx];
3136        }
3137}
3138#else
3139#define skb_update_prio(skb)
3140#endif
3141
3142DEFINE_PER_CPU(int, xmit_recursion);
3143EXPORT_SYMBOL(xmit_recursion);
3144
3145#define RECURSION_LIMIT 10
3146
3147/**
3148 *      dev_loopback_xmit - loop back @skb
3149 *      @net: network namespace this loopback is happening in
3150 *      @sk:  the socket; needed so this function can serve as a netfilter okfn
3151 *      @skb: buffer to transmit
3152 */
3153int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3154{
3155        skb_reset_mac_header(skb);
3156        __skb_pull(skb, skb_network_offset(skb));
3157        skb->pkt_type = PACKET_LOOPBACK;
3158        skb->ip_summed = CHECKSUM_UNNECESSARY;
3159        WARN_ON(!skb_dst(skb));
3160        skb_dst_force(skb);
3161        netif_rx_ni(skb);
3162        return 0;
3163}
3164EXPORT_SYMBOL(dev_loopback_xmit);
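/* A minimal sketch of a caller, assuming the skb already carries a valid
 * dst (example_loop_copy_back() is hypothetical; compare how multicast
 * output loops a clone of an outgoing frame back to the local stack):
 */
static void example_loop_copy_back(struct net *net, struct sock *sk,
                                   struct sk_buff *skb)
{
        struct sk_buff *copy = skb_clone(skb, GFP_ATOMIC);

        if (copy)
                dev_loopback_xmit(net, sk, copy);       /* consumes 'copy' */
}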
3165
3166#ifdef CONFIG_NET_EGRESS
3167static struct sk_buff *
3168sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3169{
3170        struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3171        struct tcf_result cl_res;
3172
3173        if (!cl)
3174                return skb;
3175
3176        /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3177         * earlier by the caller.
3178         */
3179        qdisc_bstats_cpu_update(cl->q, skb);
3180
3181        switch (tc_classify(skb, cl, &cl_res, false)) {
3182        case TC_ACT_OK:
3183        case TC_ACT_RECLASSIFY:
3184                skb->tc_index = TC_H_MIN(cl_res.classid);
3185                break;
3186        case TC_ACT_SHOT:
3187                qdisc_qstats_cpu_drop(cl->q);
3188                *ret = NET_XMIT_DROP;
3189                kfree_skb(skb);
3190                return NULL;
3191        case TC_ACT_STOLEN:
3192        case TC_ACT_QUEUED:
3193                *ret = NET_XMIT_SUCCESS;
3194                consume_skb(skb);
3195                return NULL;
3196        case TC_ACT_REDIRECT:
3197                /* No need to push/pop skb's mac_header here on egress! */
3198                skb_do_redirect(skb);
3199                *ret = NET_XMIT_SUCCESS;
3200                return NULL;
3201        default:
3202                break;
3203        }
3204
3205        return skb;
3206}
3207#endif /* CONFIG_NET_EGRESS */
3208
3209static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3210{
3211#ifdef CONFIG_XPS
3212        struct xps_dev_maps *dev_maps;
3213        struct xps_map *map;
3214        int queue_index = -1;
3215
3216        rcu_read_lock();
3217        dev_maps = rcu_dereference(dev->xps_maps);
3218        if (dev_maps) {
3219                map = rcu_dereference(
3220                    dev_maps->cpu_map[skb->sender_cpu - 1]);
3221                if (map) {
3222                        if (map->len == 1)
3223                                queue_index = map->queues[0];
3224                        else
3225                                queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3226                                                                           map->len)];
3227                        if (unlikely(queue_index >= dev->real_num_tx_queues))
3228                                queue_index = -1;
3229                }
3230        }
3231        rcu_read_unlock();
3232
3233        return queue_index;
3234#else
3235        return -1;
3236#endif
3237}
3238
3239static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3240{
3241        struct sock *sk = skb->sk;
3242        int queue_index = sk_tx_queue_get(sk);
3243
3244        if (queue_index < 0 || skb->ooo_okay ||
3245            queue_index >= dev->real_num_tx_queues) {
3246                int new_index = get_xps_queue(dev, skb);
3247                if (new_index < 0)
3248                        new_index = skb_tx_hash(dev, skb);
3249
3250                if (queue_index != new_index && sk &&
3251                    sk_fullsock(sk) &&
3252                    rcu_access_pointer(sk->sk_dst_cache))
3253                        sk_tx_queue_set(sk, new_index);
3254
3255                queue_index = new_index;
3256        }
3257
3258        return queue_index;
3259}
3260
3261struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3262                                    struct sk_buff *skb,
3263                                    void *accel_priv)
3264{
3265        int queue_index = 0;
3266
3267#ifdef CONFIG_XPS
3268        u32 sender_cpu = skb->sender_cpu - 1;
3269
3270        if (sender_cpu >= (u32)NR_CPUS)
3271                skb->sender_cpu = raw_smp_processor_id() + 1;
3272#endif
3273
3274        if (dev->real_num_tx_queues != 1) {
3275                const struct net_device_ops *ops = dev->netdev_ops;
3276                if (ops->ndo_select_queue)
3277                        queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3278                                                            __netdev_pick_tx);
3279                else
3280                        queue_index = __netdev_pick_tx(dev, skb);
3281
3282                if (!accel_priv)
3283                        queue_index = netdev_cap_txqueue(dev, queue_index);
3284        }
3285
3286        skb_set_queue_mapping(skb, queue_index);
3287        return netdev_get_tx_queue(dev, queue_index);
3288}
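/* A minimal sketch of a driver ndo_select_queue() that cooperates with the
 * code above: pin control traffic to queue 0, otherwise defer to the
 * fallback (__netdev_pick_tx) passed in by netdev_pick_tx().  The function
 * name and the queue-0 policy are hypothetical.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
                                void *accel_priv,
                                select_queue_fallback_t fallback)
{
        if (skb->priority == TC_PRIO_CONTROL)
                return 0;

        return fallback(dev, skb);
}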
3289
3290/**
3291 *      __dev_queue_xmit - transmit a buffer
3292 *      @skb: buffer to transmit
3293 *      @accel_priv: private data used for L2 forwarding offload
3294 *
3295 *      Queue a buffer for transmission to a network device. The caller must
3296 *      have set the device and priority and built the buffer before calling
3297 *      this function. The function can be called from an interrupt.
3298 *
3299 *      A negative errno code is returned on a failure. A success does not
3300 *      guarantee the frame will be transmitted as it may be dropped due
3301 *      to congestion or traffic shaping.
3302 *
3303 * -----------------------------------------------------------------------------------
3304 *      Note that this function can also return errors from the queue disciplines,
3305 *      including NET_XMIT_DROP, which is a positive value, so errors can be
3306 *      positive as well.
3307 *
3308 *      Regardless of the return value, the skb is consumed, so it is currently
3309 *      difficult to retry a failed send.  (You can bump the refcount before
3310 *      sending to hold a reference for a retry if you are careful.)
3311 *
3312 *      When calling this method, interrupts MUST be enabled.  This is because
3313 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3314 *          --BLG
3315 */
3316static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3317{
3318        struct net_device *dev = skb->dev;
3319        struct netdev_queue *txq;
3320        struct Qdisc *q;
3321        int rc = -ENOMEM;
3322
3323        skb_reset_mac_header(skb);
3324
3325        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3326                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3327
3328        /* Disable soft irqs for various locks below. Also
3329         * stops preemption for RCU.
3330         */
3331        rcu_read_lock_bh();
3332
3333        skb_update_prio(skb);
3334
3335        qdisc_pkt_len_init(skb);
3336#ifdef CONFIG_NET_CLS_ACT
3337        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3338# ifdef CONFIG_NET_EGRESS
3339        if (static_key_false(&egress_needed)) {
3340                skb = sch_handle_egress(skb, &rc, dev);
3341                if (!skb)
3342                        goto out;
3343        }
3344# endif
3345#endif
3346        /* If the device/qdisc doesn't need skb->dst, release it right now
3347         * while it's hot in this CPU's cache.
3348         */
3349        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3350                skb_dst_drop(skb);
3351        else
3352                skb_dst_force(skb);
3353
3354#ifdef CONFIG_NET_SWITCHDEV
3355        /* Don't forward if offload device already forwarded */
3356        if (skb->offload_fwd_mark &&
3357            skb->offload_fwd_mark == dev->offload_fwd_mark) {
3358                consume_skb(skb);
3359                rc = NET_XMIT_SUCCESS;
3360                goto out;
3361        }
3362#endif
3363
3364        txq = netdev_pick_tx(dev, skb, accel_priv);
3365        q = rcu_dereference_bh(txq->qdisc);
3366
3367        trace_net_dev_queue(skb);
3368        if (q->enqueue) {
3369                rc = __dev_xmit_skb(skb, q, dev, txq);
3370                goto out;
3371        }
3372
3373        /* The device has no queue. This is the common case for software
3374           devices: loopback, all sorts of tunnels...

3376           Really, it is unlikely that netif_tx_lock protection is necessary
3377           here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
3378           counters.)
3379           However, it is possible that they rely on the protection
3380           we provide here.

3382           Check this and take the lock; it is not prone to deadlocks.
3383           Or just shoot the noqueue qdisc, which is even simpler 8)
3384         */
3385        if (dev->flags & IFF_UP) {
3386                int cpu = smp_processor_id(); /* ok because BHs are off */
3387
3388                if (txq->xmit_lock_owner != cpu) {
3389
3390                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3391                                goto recursion_alert;
3392
3393                        skb = validate_xmit_skb(skb, dev);
3394                        if (!skb)
3395                                goto out;
3396
3397                        HARD_TX_LOCK(dev, txq, cpu);
3398
3399                        if (!netif_xmit_stopped(txq)) {
3400                                __this_cpu_inc(xmit_recursion);
3401                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3402                                __this_cpu_dec(xmit_recursion);
3403                                if (dev_xmit_complete(rc)) {
3404                                        HARD_TX_UNLOCK(dev, txq);
3405                                        goto out;
3406                                }
3407                        }
3408                        HARD_TX_UNLOCK(dev, txq);
3409                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3410                                             dev->name);
3411                } else {
3412                        /* Recursion has been detected! Unfortunately,
3413                         * it is possible.
3414                         */
3415recursion_alert:
3416                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3417                                             dev->name);
3418                }
3419        }
3420
3421        rc = -ENETDOWN;
3422        rcu_read_unlock_bh();
3423
3424        atomic_long_inc(&dev->tx_dropped);
3425        kfree_skb_list(skb);
3426        return rc;
3427out:
3428        rcu_read_unlock_bh();
3429        return rc;
3430}
3431
3432int dev_queue_xmit(struct sk_buff *skb)
3433{
3434        return __dev_queue_xmit(skb, NULL);
3435}
3436EXPORT_SYMBOL(dev_queue_xmit);
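/* A minimal sketch of a caller: hand a fully built skb to the stack.  The
 * skb's headers and skb->protocol are assumed to be set already, and
 * dev_queue_xmit() consumes the skb whether or not it succeeds
 * (example_send_frame() is hypothetical).
 */
static int example_send_frame(struct sk_buff *skb, struct net_device *dev)
{
        int rc;

        skb->dev = dev;
        rc = dev_queue_xmit(skb);
        if (rc != NET_XMIT_SUCCESS)
                net_dbg_ratelimited("example: xmit returned %d\n", rc);

        return rc;
}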
3437
3438int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3439{
3440        return __dev_queue_xmit(skb, accel_priv);
3441}
3442EXPORT_SYMBOL(dev_queue_xmit_accel);
3443
3444
3445/*=======================================================================
3446                        Receiver routines
3447  =======================================================================*/
3448
3449int netdev_max_backlog __read_mostly = 1000;
3450EXPORT_SYMBOL(netdev_max_backlog);
3451
3452int netdev_tstamp_prequeue __read_mostly = 1;
3453int netdev_budget __read_mostly = 300;
3454int weight_p __read_mostly = 64;            /* old backlog weight */
3455
3456/* Called with irqs disabled */
3457static inline void ____napi_schedule(struct softnet_data *sd,
3458                                     struct napi_struct *napi)
3459{
3460        list_add_tail(&napi->poll_list, &sd->poll_list);
3461        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3462}
3463
3464#ifdef CONFIG_RPS
3465
3466/* One global table that all flow-based protocols share. */
3467struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3468EXPORT_SYMBOL(rps_sock_flow_table);
3469u32 rps_cpu_mask __read_mostly;
3470EXPORT_SYMBOL(rps_cpu_mask);
3471
3472struct static_key rps_needed __read_mostly;
3473EXPORT_SYMBOL(rps_needed);
3474
3475static struct rps_dev_flow *
3476set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3477            struct rps_dev_flow *rflow, u16 next_cpu)
3478{
3479        if (next_cpu < nr_cpu_ids) {
3480#ifdef CONFIG_RFS_ACCEL
3481                struct netdev_rx_queue *rxqueue;
3482                struct rps_dev_flow_table *flow_table;
3483                struct rps_dev_flow *old_rflow;
3484                u32 flow_id;
3485                u16 rxq_index;
3486                int rc;
3487
3488                /* Should we steer this flow to a different hardware queue? */
3489                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3490                    !(dev->features & NETIF_F_NTUPLE))
3491                        goto out;
3492                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3493                if (rxq_index == skb_get_rx_queue(skb))
3494                        goto out;
3495
3496                rxqueue = dev->_rx + rxq_index;
3497                flow_table = rcu_dereference(rxqueue->rps_flow_table);
3498                if (!flow_table)
3499                        goto out;
3500                flow_id = skb_get_hash(skb) & flow_table->mask;
3501                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3502                                                        rxq_index, flow_id);
3503                if (rc < 0)
3504                        goto out;
3505                old_rflow = rflow;
3506                rflow = &flow_table->flows[flow_id];
3507                rflow->filter = rc;
3508                if (old_rflow->filter == rflow->filter)
3509                        old_rflow->filter = RPS_NO_FILTER;
3510        out:
3511#endif
3512                rflow->last_qtail =
3513                        per_cpu(softnet_data, next_cpu).input_queue_head;
3514        }
3515
3516        rflow->cpu = next_cpu;
3517        return rflow;
3518}
3519
3520/*
3521 * get_rps_cpu is called from netif_receive_skb and returns the target
3522 * CPU from the RPS map of the receiving queue for a given skb.
3523 * rcu_read_lock must be held on entry.
3524 */
3525static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3526                       struct rps_dev_flow **rflowp)
3527{
3528        const struct rps_sock_flow_table *sock_flow_table;
3529        struct netdev_rx_queue *rxqueue = dev->_rx;
3530        struct rps_dev_flow_table *flow_table;
3531        struct rps_map *map;
3532        int cpu = -1;
3533        u32 tcpu;
3534        u32 hash;
3535
3536        if (skb_rx_queue_recorded(skb)) {
3537                u16 index = skb_get_rx_queue(skb);
3538
3539                if (unlikely(index >= dev->real_num_rx_queues)) {
3540                        WARN_ONCE(dev->real_num_rx_queues > 1,
3541                                  "%s received packet on queue %u, but number "
3542                                  "of RX queues is %u\n",
3543                                  dev->name, index, dev->real_num_rx_queues);
3544                        goto done;
3545                }
3546                rxqueue += index;
3547        }
3548
3549        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3550
3551        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3552        map = rcu_dereference(rxqueue->rps_map);
3553        if (!flow_table && !map)
3554                goto done;
3555
3556        skb_reset_network_header(skb);
3557        hash = skb_get_hash(skb);
3558        if (!hash)
3559                goto done;
3560
3561        sock_flow_table = rcu_dereference(rps_sock_flow_table);
3562        if (flow_table && sock_flow_table) {
3563                struct rps_dev_flow *rflow;
3564                u32 next_cpu;
3565                u32 ident;
3566
3567                /* First check the global flow table for a match */
3568                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3569                if ((ident ^ hash) & ~rps_cpu_mask)
3570                        goto try_rps;
3571
3572                next_cpu = ident & rps_cpu_mask;
3573
3574                /* OK, now we know there is a match,
3575                 * we can look at the local (per receive queue) flow table
3576                 */
3577                rflow = &flow_table->flows[hash & flow_table->mask];
3578                tcpu = rflow->cpu;
3579
3580                /*
3581                 * If the desired CPU (where last recvmsg was done) is
3582                 * different from current CPU (one in the rx-queue flow
3583                 * table entry), switch if one of the following holds:
3584                 *   - Current CPU is unset (>= nr_cpu_ids).
3585                 *   - Current CPU is offline.
3586                 *   - The current CPU's queue tail has advanced beyond the
3587                 *     last packet that was enqueued using this table entry.
3588                 *     This guarantees that all previous packets for the flow
3589                 *     have been dequeued, thus preserving in order delivery.
3590                 */
3591                if (unlikely(tcpu != next_cpu) &&
3592                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3593                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3594                      rflow->last_qtail)) >= 0)) {
3595                        tcpu = next_cpu;
3596                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3597                }
3598
3599                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3600                        *rflowp = rflow;
3601                        cpu = tcpu;
3602                        goto done;
3603                }
3604        }
3605
3606try_rps:
3607
3608        if (map) {
3609                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3610                if (cpu_online(tcpu)) {
3611                        cpu = tcpu;
3612                        goto done;
3613                }
3614        }
3615
3616done:
3617        return cpu;
3618}
3619
3620#ifdef CONFIG_RFS_ACCEL
3621
3622/**
3623 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3624 * @dev: Device on which the filter was set
3625 * @rxq_index: RX queue index
3626 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3627 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3628 *
3629 * Drivers that implement ndo_rx_flow_steer() should periodically call
3630 * this function for each installed filter and remove the filters for
3631 * which it returns %true.
3632 */
3633bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3634                         u32 flow_id, u16 filter_id)
3635{
3636        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3637        struct rps_dev_flow_table *flow_table;
3638        struct rps_dev_flow *rflow;
3639        bool expire = true;
3640        unsigned int cpu;
3641
3642        rcu_read_lock();
3643        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3644        if (flow_table && flow_id <= flow_table->mask) {
3645                rflow = &flow_table->flows[flow_id];
3646                cpu = ACCESS_ONCE(rflow->cpu);
3647                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3648                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3649                           rflow->last_qtail) <
3650                     (int)(10 * flow_table->mask)))
3651                        expire = false;
3652        }
3653        rcu_read_unlock();
3654        return expire;
3655}
3656EXPORT_SYMBOL(rps_may_expire_flow);
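/* A minimal sketch of the periodic expiry scan described above.  The
 * bookkeeping structure and the example_* names are hypothetical driver
 * state, not a kernel API; a real driver would also remove its hardware
 * filter when a flow expires.
 */
struct example_rfs_filter {
        bool    installed;
        u32     flow_id;        /* value passed to ndo_rx_flow_steer() */
        u16     filter_id;      /* value returned by ndo_rx_flow_steer() */
};

static void example_expire_rfs_filters(struct net_device *dev, u16 rxq_index,
                                        struct example_rfs_filter *f,
                                        unsigned int n)
{
        unsigned int i;

        for (i = 0; i < n; i++) {
                if (f[i].installed &&
                    rps_may_expire_flow(dev, rxq_index, f[i].flow_id,
                                        f[i].filter_id))
                        f[i].installed = false;
        }
}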
3657
3658#endif /* CONFIG_RFS_ACCEL */
3659
3660/* Called from hardirq (IPI) context */
3661static void rps_trigger_softirq(void *data)
3662{
3663        struct softnet_data *sd = data;
3664
3665        ____napi_schedule(sd, &sd->backlog);
3666        sd->received_rps++;
3667}
3668
3669#endif /* CONFIG_RPS */
3670
3671/*
3672 * Check whether this softnet_data structure belongs to another CPU.
3673 * If it does, queue it on our IPI list and return 1;
3674 * otherwise return 0.
3675 */
3676static int rps_ipi_queued(struct softnet_data *sd)
3677{
3678#ifdef CONFIG_RPS
3679        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3680
3681        if (sd != mysd) {
3682                sd->rps_ipi_next = mysd->rps_ipi_list;
3683                mysd->rps_ipi_list = sd;
3684
3685                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3686                return 1;
3687        }
3688#endif /* CONFIG_RPS */
3689        return 0;
3690}
3691
3692#ifdef CONFIG_NET_FLOW_LIMIT
3693int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3694#endif
3695
3696static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3697{
3698#ifdef CONFIG_NET_FLOW_LIMIT
3699        struct sd_flow_limit *fl;
3700        struct softnet_data *sd;
3701        unsigned int old_flow, new_flow;
3702
3703        if (qlen < (netdev_max_backlog >> 1))
3704                return false;
3705
3706        sd = this_cpu_ptr(&softnet_data);
3707
3708        rcu_read_lock();
3709        fl = rcu_dereference(sd->flow_limit);
3710        if (fl) {
3711                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3712                old_flow = fl->history[fl->history_head];
3713                fl->history[fl->history_head] = new_flow;
3714
3715                fl->history_head++;
3716                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3717
3718                if (likely(fl->buckets[old_flow]))
3719                        fl->buckets[old_flow]--;
3720
3721                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3722                        fl->count++;
3723                        rcu_read_unlock();
3724                        return true;
3725                }
3726        }
3727        rcu_read_unlock();
3728#endif
3729        return false;
3730}
3731
3732/*
3733 * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
3734 * queue (which may belong to a remote CPU).
3735 */
3736static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3737                              unsigned int *qtail)
3738{
3739        struct softnet_data *sd;
3740        unsigned long flags;
3741        unsigned int qlen;
3742
3743        sd = &per_cpu(softnet_data, cpu);
3744
3745        local_irq_save(flags);
3746
3747        rps_lock(sd);
3748        if (!netif_running(skb->dev))
3749                goto drop;
3750        qlen = skb_queue_len(&sd->input_pkt_queue);
3751        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3752                if (qlen) {
3753enqueue:
3754                        __skb_queue_tail(&sd->input_pkt_queue, skb);
3755                        input_queue_tail_incr_save(sd, qtail);
3756                        rps_unlock(sd);
3757                        local_irq_restore(flags);
3758                        return NET_RX_SUCCESS;
3759                }
3760
3761                /* Schedule NAPI for the backlog device.
3762                 * We can use a non-atomic operation since we own the queue lock.
3763                 */
3764                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3765                        if (!rps_ipi_queued(sd))
3766                                ____napi_schedule(sd, &sd->backlog);
3767                }
3768                goto enqueue;
3769        }
3770
3771drop:
3772        sd->dropped++;
3773        rps_unlock(sd);
3774
3775        local_irq_restore(flags);
3776
3777        atomic_long_inc(&skb->dev->rx_dropped);
3778        kfree_skb(skb);
3779        return NET_RX_DROP;
3780}
3781
3782static int netif_rx_internal(struct sk_buff *skb)
3783{
3784        int ret;
3785
3786        net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788        trace_netif_rx(skb);
3789#ifdef CONFIG_RPS
3790        if (static_key_false(&rps_needed)) {
3791                struct rps_dev_flow voidflow, *rflow = &voidflow;
3792                int cpu;
3793
3794                preempt_disable();
3795                rcu_read_lock();
3796
3797                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3798                if (cpu < 0)
3799                        cpu = smp_processor_id();
3800
3801                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802
3803                rcu_read_unlock();
3804                preempt_enable();
3805        } else
3806#endif
3807        {
3808                unsigned int qtail;
3809                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3810                put_cpu();
3811        }
3812        return ret;
3813}
3814
3815/**
3816 *      netif_rx        -       post buffer to the network code
3817 *      @skb: buffer to post
3818 *
3819 *      This function receives a packet from a device driver and queues it for
3820 *      the upper (protocol) levels to process.  It always succeeds. The buffer
3821 *      may be dropped during processing for congestion control or by the
3822 *      protocol layers.
3823 *
3824 *      return values:
3825 *      NET_RX_SUCCESS  (no congestion)
3826 *      NET_RX_DROP     (packet was dropped)
3827 *
3828 */
3829
3830int netif_rx(struct sk_buff *skb)
3831{
3832        trace_netif_rx_entry(skb);
3833
3834        return netif_rx_internal(skb);
3835}
3836EXPORT_SYMBOL(netif_rx);
3837
3838int netif_rx_ni(struct sk_buff *skb)
3839{
3840        int err;
3841
3842        trace_netif_rx_ni_entry(skb);
3843
3844        preempt_disable();
3845        err = netif_rx_internal(skb);
3846        if (local_softirq_pending())
3847                do_softirq();
3848        preempt_enable();
3849
3850        return err;
3851}
3852EXPORT_SYMBOL(netif_rx_ni);
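/* A minimal sketch of how a driver hands received frames to the two entry
 * points above: netif_rx() from interrupt context, netif_rx_ni() from
 * process context (the example_* functions are hypothetical).
 */
static void example_rx_irq(struct net_device *dev, struct sk_buff *skb)
{
        skb->protocol = eth_type_trans(skb, dev);       /* also sets skb->dev */
        netif_rx(skb);
}

static void example_rx_thread(struct net_device *dev, struct sk_buff *skb)
{
        skb->protocol = eth_type_trans(skb, dev);
        netif_rx_ni(skb);
}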
3853
3854static void net_tx_action(struct softirq_action *h)
3855{
3856        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3857
3858        if (sd->completion_queue) {
3859                struct sk_buff *clist;
3860
3861                local_irq_disable();
3862                clist = sd->completion_queue;
3863                sd->completion_queue = NULL;
3864                local_irq_enable();
3865
3866                while (clist) {
3867                        struct sk_buff *skb = clist;
3868                        clist = clist->next;
3869
3870                        WARN_ON(atomic_read(&skb->users));
3871                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3872                                trace_consume_skb(skb);
3873                        else
3874                                trace_kfree_skb(skb, net_tx_action);
3875
3876                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3877                                __kfree_skb(skb);
3878                        else
3879                                __kfree_skb_defer(skb);
3880                }
3881
3882                __kfree_skb_flush();
3883        }
3884
3885        if (sd->output_queue) {
3886                struct Qdisc *head;
3887
3888                local_irq_disable();
3889                head = sd->output_queue;
3890                sd->output_queue = NULL;
3891                sd->output_queue_tailp = &sd->output_queue;
3892                local_irq_enable();
3893
3894                while (head) {
3895                        struct Qdisc *q = head;
3896                        spinlock_t *root_lock;
3897
3898                        head = head->next_sched;
3899
3900                        root_lock = qdisc_lock(q);
3901                        if (spin_trylock(root_lock)) {
3902                                smp_mb__before_atomic();
3903                                clear_bit(__QDISC_STATE_SCHED,
3904                                          &q->state);
3905                                qdisc_run(q);
3906                                spin_unlock(root_lock);
3907                        } else {
3908                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
3909                                              &q->state)) {
3910                                        __netif_reschedule(q);
3911                                } else {
3912                                        smp_mb__before_atomic();
3913                                        clear_bit(__QDISC_STATE_SCHED,
3914                                                  &q->state);
3915                                }
3916                        }
3917                }
3918        }
3919}
3920
3921#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3922    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3923/* This hook is defined here for ATM LANE */
3924int (*br_fdb_test_addr_hook)(struct net_device *dev,
3925                             unsigned char *addr) __read_mostly;
3926EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3927#endif
3928
3929static inline struct sk_buff *
3930sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3931                   struct net_device *orig_dev)
3932{
3933#ifdef CONFIG_NET_CLS_ACT
3934        struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3935        struct tcf_result cl_res;
3936
3937        /* If there's at least one ingress qdisc present somewhere (so
3938         * we get here via the enabled static key), remaining devices
3939         * that are not configured with an ingress qdisc will bail
3940         * out here.
3941         */
3942        if (!cl)
3943                return skb;
3944        if (*pt_prev) {
3945                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3946                *pt_prev = NULL;
3947        }
3948
3949        qdisc_skb_cb(skb)->pkt_len = skb->len;
3950        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3951        qdisc_bstats_cpu_update(cl->q, skb);
3952
3953        switch (tc_classify(skb, cl, &cl_res, false)) {
3954        case TC_ACT_OK:
3955        case TC_ACT_RECLASSIFY:
3956                skb->tc_index = TC_H_MIN(cl_res.classid);
3957                break;
3958        case TC_ACT_SHOT:
3959                qdisc_qstats_cpu_drop(cl->q);
3960                kfree_skb(skb);
3961                return NULL;
3962        case TC_ACT_STOLEN:
3963        case TC_ACT_QUEUED:
3964                consume_skb(skb);
3965                return NULL;
3966        case TC_ACT_REDIRECT:
3967                /* skb_mac_header check was done by cls/act_bpf, so
3968                 * we can safely push the L2 header back before
3969                 * redirecting to another netdev
3970                 */
3971                __skb_push(skb, skb->mac_len);
3972                skb_do_redirect(skb);
3973                return NULL;
3974        default:
3975                break;
3976        }
3977#endif /* CONFIG_NET_CLS_ACT */
3978        return skb;
3979}
3980
3981/**
3982 *      netdev_is_rx_handler_busy - check if receive handler is registered
3983 *      @dev: device to check
3984 *
3985 *      Check if a receive handler is already registered for a given device.
3986 *      Return true if there is one.
3987 *
3988 *      The caller must hold the rtnl_mutex.
3989 */
3990bool netdev_is_rx_handler_busy(struct net_device *dev)
3991{
3992        ASSERT_RTNL();
3993        return dev && rtnl_dereference(dev->rx_handler);
3994}
3995EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3996
3997/**
3998 *      netdev_rx_handler_register - register receive handler
3999 *      @dev: device to register a handler for
4000 *      @rx_handler: receive handler to register
4001 *      @rx_handler_data: data pointer that is used by rx handler
4002 *
4003 *      Register a receive handler for a device. This handler will then be
4004 *      called from __netif_receive_skb. A negative errno code is returned
4005 *      on a failure.
4006 *
4007 *      The caller must hold the rtnl_mutex.
4008 *
4009 *      For a general description of rx_handler, see enum rx_handler_result.
4010 */
4011int netdev_rx_handler_register(struct net_device *dev,
4012                               rx_handler_func_t *rx_handler,
4013                               void *rx_handler_data)
4014{
4015        ASSERT_RTNL();
4016
4017        if (dev->rx_handler)
4018                return -EBUSY;
4019
4020        /* Note: rx_handler_data must be set before rx_handler */
4021        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4022        rcu_assign_pointer(dev->rx_handler, rx_handler);
4023
4024        return 0;
4025}
4026EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4027
4028/**
4029 *      netdev_rx_handler_unregister - unregister receive handler
4030 *      @dev: device to unregister a handler from
4031 *
4032 *      Unregister a receive handler from a device.
4033 *
4034 *      The caller must hold the rtnl_mutex.
4035 */
4036void netdev_rx_handler_unregister(struct net_device *dev)
4037{
4038
4039        ASSERT_RTNL();
4040        RCU_INIT_POINTER(dev->rx_handler, NULL);
4041        /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4042         * section is guaranteed to see a non-NULL rx_handler_data
4043         * as well.
4044         */
4045        synchronize_net();
4046        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4047}
4048EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
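/* A minimal sketch of an rx_handler and its (un)registration, in the style
 * of bridge/bonding/macvlan.  The example_* names and the pass-through
 * behaviour are hypothetical.
 */
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;

        if (skb->pkt_type == PACKET_LOOPBACK)
                return RX_HANDLER_PASS;

        /* A real handler may consume the skb, substitute a new one via
         * *pskb, or ask for another round of processing.
         */
        return RX_HANDLER_PASS;
}

static int example_attach(struct net_device *dev, void *priv)
{
        int err;

        rtnl_lock();
        err = netdev_rx_handler_register(dev, example_handle_frame, priv);
        rtnl_unlock();
        return err;
}

static void example_detach(struct net_device *dev)
{
        rtnl_lock();
        netdev_rx_handler_unregister(dev);
        rtnl_unlock();
}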
4049
4050/*
4051 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4052 * the special handling of PFMEMALLOC skbs.
4053 */
4054static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4055{
4056        switch (skb->protocol) {
4057        case htons(ETH_P_ARP):
4058        case htons(ETH_P_IP):
4059        case htons(ETH_P_IPV6):
4060        case htons(ETH_P_8021Q):
4061        case htons(ETH_P_8021AD):
4062                return true;
4063        default:
4064                return false;
4065        }
4066}
4067
4068static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4069                             int *ret, struct net_device *orig_dev)
4070{
4071#ifdef CONFIG_NETFILTER_INGRESS
4072        if (nf_hook_ingress_active(skb)) {
4073                if (*pt_prev) {
4074                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
4075                        *pt_prev = NULL;
4076                }
4077
4078                return nf_hook_ingress(skb);
4079        }
4080#endif /* CONFIG_NETFILTER_INGRESS */
4081        return 0;
4082}
4083
4084static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4085{
4086        struct packet_type *ptype, *pt_prev;
4087        rx_handler_func_t *rx_handler;
4088        struct net_device *orig_dev;
4089        bool deliver_exact = false;
4090        int ret = NET_RX_DROP;
4091        __be16 type;
4092
4093        net_timestamp_check(!netdev_tstamp_prequeue, skb);
4094
4095        trace_netif_receive_skb(skb);
4096
4097        orig_dev = skb->dev;
4098
4099        skb_reset_network_header(skb);
4100        if (!skb_transport_header_was_set(skb))
4101                skb_reset_transport_header(skb);
4102        skb_reset_mac_len(skb);
4103
4104        pt_prev = NULL;
4105
4106another_round:
4107        skb->skb_iif = skb->dev->ifindex;
4108
4109        __this_cpu_inc(softnet_data.processed);
4110
4111        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4112            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4113                skb = skb_vlan_untag(skb);
4114                if (unlikely(!skb))
4115                        goto out;
4116        }
4117
4118#ifdef CONFIG_NET_CLS_ACT
4119        if (skb->tc_verd & TC_NCLS) {
4120                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4121                goto ncls;
4122        }
4123#endif
4124
4125        if (pfmemalloc)
4126                goto skip_taps;
4127
4128        list_for_each_entry_rcu(ptype, &ptype_all, list) {
4129                if (pt_prev)
4130                        ret = deliver_skb(skb, pt_prev, orig_dev);
4131                pt_prev = ptype;
4132        }
4133
4134        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4135                if (pt_prev)
4136                        ret = deliver_skb(skb, pt_prev, orig_dev);
4137                pt_prev = ptype;
4138        }
4139
4140skip_taps:
4141#ifdef CONFIG_NET_INGRESS
4142        if (static_key_false(&ingress_needed)) {
4143                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4144                if (!skb)
4145                        goto out;
4146
4147                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4148                        goto out;
4149        }
4150#endif
4151#ifdef CONFIG_NET_CLS_ACT
4152        skb->tc_verd = 0;
4153ncls:
4154#endif
4155        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4156                goto drop;
4157
4158        if (skb_vlan_tag_present(skb)) {
4159                if (pt_prev) {
4160                        ret = deliver_skb(skb, pt_prev, orig_dev);
4161                        pt_prev = NULL;
4162                }
4163                if (vlan_do_receive(&skb))
4164                        goto another_round;
4165                else if (unlikely(!skb))
4166                        goto out;
4167        }
4168
4169        rx_handler = rcu_dereference(skb->dev->rx_handler);
4170        if (rx_handler) {
4171                if (pt_prev) {
4172                        ret = deliver_skb(skb, pt_prev, orig_dev);
4173                        pt_prev = NULL;
4174                }
4175                switch (rx_handler(&skb)) {
4176                case RX_HANDLER_CONSUMED:
4177                        ret = NET_RX_SUCCESS;
4178                        goto out;
4179                case RX_HANDLER_ANOTHER:
4180                        goto another_round;
4181                case RX_HANDLER_EXACT:
4182                        deliver_exact = true;
4183                case RX_HANDLER_PASS:
4184                        break;
4185                default:
4186                        BUG();
4187                }
4188        }
4189
4190        if (unlikely(skb_vlan_tag_present(skb))) {
4191                if (skb_vlan_tag_get_id(skb))
4192                        skb->pkt_type = PACKET_OTHERHOST;
4193                /* Note: we might in the future use the prio bits
4194                 * and set skb->priority as in vlan_do_receive().
4195                 * For the time being, just ignore the Priority Code Point.
4196                 */
4197                skb->vlan_tci = 0;
4198        }
4199
4200        type = skb->protocol;
4201
4202        /* deliver only exact match when indicated */
4203        if (likely(!deliver_exact)) {
4204                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4205                                       &ptype_base[ntohs(type) &
4206                                                   PTYPE_HASH_MASK]);
4207        }
4208
4209        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4210                               &orig_dev->ptype_specific);
4211
4212        if (unlikely(skb->dev != orig_dev)) {
4213                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4214                                       &skb->dev->ptype_specific);
4215        }
4216
4217        if (pt_prev) {
4218                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4219                        goto drop;
4220                else
4221                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4222        } else {
4223drop:
4224                if (!deliver_exact)
4225                        atomic_long_inc(&skb->dev->rx_dropped);
4226                else
4227                        atomic_long_inc(&skb->dev->rx_nohandler);
4228                kfree_skb(skb);
4229                /* Jamal, now you will not be able to escape explaining
4230                 * to me how you were going to use this. :-)
4231                 */
4232                ret = NET_RX_DROP;
4233        }
4234
4235out:
4236        return ret;
4237}
4238
4239static int __netif_receive_skb(struct sk_buff *skb)
4240{
4241        int ret;
4242
4243        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4244                unsigned long pflags = current->flags;
4245
4246                /*
4247                 * PFMEMALLOC skbs are special, they should
4248                 * - be delivered to SOCK_MEMALLOC sockets only
4249                 * - stay away from userspace
4250                 * - have bounded memory usage
4251                 *
4252                 * Use PF_MEMALLOC as this saves us from propagating the allocation
4253                 * context down to all allocation sites.
4254                 */
4255                current->flags |= PF_MEMALLOC;
4256                ret = __netif_receive_skb_core(skb, true);
4257                tsk_restore_flags(current, pflags, PF_MEMALLOC);
4258        } else
4259                ret = __netif_receive_skb_core(skb, false);
4260
4261        return ret;
4262}
4263
4264static int netif_receive_skb_internal(struct sk_buff *skb)
4265{
4266        int ret;
4267
4268        net_timestamp_check(netdev_tstamp_prequeue, skb);
4269
4270        if (skb_defer_rx_timestamp(skb))
4271                return NET_RX_SUCCESS;
4272
4273        rcu_read_lock();
4274
4275#ifdef CONFIG_RPS
4276        if (static_key_false(&rps_needed)) {
4277                struct rps_dev_flow voidflow, *rflow = &voidflow;
4278                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4279
4280                if (cpu >= 0) {
4281                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4282                        rcu_read_unlock();
4283                        return ret;
4284                }
4285        }
4286#endif
4287        ret = __netif_receive_skb(skb);
4288        rcu_read_unlock();
4289        return ret;
4290}
4291
4292/**
4293 *      netif_receive_skb - process receive buffer from network
4294 *      @skb: buffer to process
4295 *
4296 *      netif_receive_skb() is the main receive data processing function.
4297 *      It always succeeds. The buffer may be dropped during processing
4298 *      for congestion control or by the protocol layers.
4299 *
4300 *      This function may only be called from softirq context and interrupts
4301 *      should be enabled.
4302 *
4303 *      Return values (usually ignored):
4304 *      NET_RX_SUCCESS: no congestion
4305 *      NET_RX_DROP: packet was dropped
4306 */
4307int netif_receive_skb(struct sk_buff *skb)
4308{
4309        trace_netif_receive_skb_entry(skb);
4310
4311        return netif_receive_skb_internal(skb);
4312}
4313EXPORT_SYMBOL(netif_receive_skb);
4314
4315/* Network device is going away, flush any packets still pending.
4316 * Called with irqs disabled.
4317 */
4318static void flush_backlog(void *arg)
4319{
4320        struct net_device *dev = arg;
4321        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4322        struct sk_buff *skb, *tmp;
4323
4324        rps_lock(sd);
4325        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4326                if (skb->dev == dev) {
4327                        __skb_unlink(skb, &sd->input_pkt_queue);
4328                        kfree_skb(skb);
4329                        input_queue_head_incr(sd);
4330                }
4331        }
4332        rps_unlock(sd);
4333
4334        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4335                if (skb->dev == dev) {
4336                        __skb_unlink(skb, &sd->process_queue);
4337                        kfree_skb(skb);
4338                        input_queue_head_incr(sd);
4339                }
4340        }
4341}
4342
4343static int napi_gro_complete(struct sk_buff *skb)
4344{
4345        struct packet_offload *ptype;
4346        __be16 type = skb->protocol;
4347        struct list_head *head = &offload_base;
4348        int err = -ENOENT;
4349
4350        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4351
4352        if (NAPI_GRO_CB(skb)->count == 1) {
4353                skb_shinfo(skb)->gso_size = 0;
4354                goto out;
4355        }
4356
4357        rcu_read_lock();
4358        list_for_each_entry_rcu(ptype, head, list) {
4359                if (ptype->type != type || !ptype->callbacks.gro_complete)
4360                        continue;
4361
4362                err = ptype->callbacks.gro_complete(skb, 0);
4363                break;
4364        }
4365        rcu_read_unlock();
4366
4367        if (err) {
4368                WARN_ON(&ptype->list == head);
4369                kfree_skb(skb);
4370                return NET_RX_SUCCESS;
4371        }
4372
4373out:
4374        return netif_receive_skb_internal(skb);
4375}
4376
4377/* napi->gro_list contains packets ordered by age, with the
4378 * youngest packets at the head of the list.
4379 * Complete skbs in reverse order to reduce latencies.
4380 */
4381void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4382{
4383        struct sk_buff *skb, *prev = NULL;
4384
4385        /* scan list and build reverse chain */
4386        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4387                skb->prev = prev;
4388                prev = skb;
4389        }
4390
4391        for (skb = prev; skb; skb = prev) {
4392                skb->next = NULL;
4393
4394                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4395                        return;
4396
4397                prev = skb->prev;
4398                napi_gro_complete(skb);
4399                napi->gro_count--;
4400        }
4401
4402        napi->gro_list = NULL;
4403}
4404EXPORT_SYMBOL(napi_gro_flush);
4405
4406static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4407{
4408        struct sk_buff *p;
4409        unsigned int maclen = skb->dev->hard_header_len;
4410        u32 hash = skb_get_hash_raw(skb);
4411
4412        for (p = napi->gro_list; p; p = p->next) {
4413                unsigned long diffs;
4414
4415                NAPI_GRO_CB(p)->flush = 0;
4416
4417                if (hash != skb_get_hash_raw(p)) {
4418                        NAPI_GRO_CB(p)->same_flow = 0;
4419                        continue;
4420                }
4421
4422                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4423                diffs |= p->vlan_tci ^ skb->vlan_tci;
4424                diffs |= skb_metadata_dst_cmp(p, skb);
4425                if (maclen == ETH_HLEN)
4426                        diffs |= compare_ether_header(skb_mac_header(p),
4427                                                      skb_mac_header(skb));
4428                else if (!diffs)
4429                        diffs = memcmp(skb_mac_header(p),
4430                                       skb_mac_header(skb),
4431                                       maclen);
4432                NAPI_GRO_CB(p)->same_flow = !diffs;
4433        }
4434}
4435
4436static void skb_gro_reset_offset(struct sk_buff *skb)
4437{
4438        const struct skb_shared_info *pinfo = skb_shinfo(skb);
4439        const skb_frag_t *frag0 = &pinfo->frags[0];
4440
4441        NAPI_GRO_CB(skb)->data_offset = 0;
4442        NAPI_GRO_CB(skb)->frag0 = NULL;
4443        NAPI_GRO_CB(skb)->frag0_len = 0;
4444
4445        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4446            pinfo->nr_frags &&
4447            !PageHighMem(skb_frag_page(frag0))) {
4448                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4449                NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4450        }
4451}
4452
4453static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4454{
4455        struct skb_shared_info *pinfo = skb_shinfo(skb);
4456
4457        BUG_ON(skb->end - skb->tail < grow);
4458
4459        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4460
4461        skb->data_len -= grow;
4462        skb->tail += grow;
4463
4464        pinfo->frags[0].page_offset += grow;
4465        skb_frag_size_sub(&pinfo->frags[0], grow);
4466
4467        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4468                skb_frag_unref(skb, 0);
4469                memmove(pinfo->frags, pinfo->frags + 1,
4470                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4471        }
4472}
4473
4474static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4475{
4476        struct sk_buff **pp = NULL;
4477        struct packet_offload *ptype;
4478        __be16 type = skb->protocol;
4479        struct list_head *head = &offload_base;
4480        int same_flow;
4481        enum gro_result ret;
4482        int grow;
4483
4484        if (!(skb->dev->features & NETIF_F_GRO))
4485                goto normal;
4486
4487        if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4488                goto normal;
4489
4490        gro_list_prepare(napi, skb);
4491
4492        rcu_read_lock();
4493        list_for_each_entry_rcu(ptype, head, list) {
4494                if (ptype->type != type || !ptype->callbacks.gro_receive)
4495                        continue;
4496
4497                skb_set_network_header(skb, skb_gro_offset(skb));
4498                skb_reset_mac_len(skb);
4499                NAPI_GRO_CB(skb)->same_flow = 0;
4500                NAPI_GRO_CB(skb)->flush = 0;
4501                NAPI_GRO_CB(skb)->free = 0;
4502                NAPI_GRO_CB(skb)->encap_mark = 0;
4503                NAPI_GRO_CB(skb)->is_fou = 0;
4504                NAPI_GRO_CB(skb)->is_atomic = 1;
4505                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4506
4507                /* Setup for GRO checksum validation */
4508                switch (skb->ip_summed) {
4509                case CHECKSUM_COMPLETE:
4510                        NAPI_GRO_CB(skb)->csum = skb->csum;
4511                        NAPI_GRO_CB(skb)->csum_valid = 1;
4512                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4513                        break;
4514                case CHECKSUM_UNNECESSARY:
4515                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4516                        NAPI_GRO_CB(skb)->csum_valid = 0;
4517                        break;
4518                default:
4519                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4520                        NAPI_GRO_CB(skb)->csum_valid = 0;
4521                }
4522
4523                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4524                break;
4525        }
4526        rcu_read_unlock();
4527
4528        if (&ptype->list == head)
4529                goto normal;
4530
4531        same_flow = NAPI_GRO_CB(skb)->same_flow;
4532        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4533
4534        if (pp) {
4535                struct sk_buff *nskb = *pp;
4536
4537                *pp = nskb->next;
4538                nskb->next = NULL;
4539                napi_gro_complete(nskb);
4540                napi->gro_count--;
4541        }
4542
4543        if (same_flow)
4544                goto ok;
4545
4546        if (NAPI_GRO_CB(skb)->flush)
4547                goto normal;
4548
4549        if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4550                struct sk_buff *nskb = napi->gro_list;
4551
4552                /* locate the end of the list to select the 'oldest' flow */
4553                while (nskb->next) {
4554                        pp = &nskb->next;
4555                        nskb = *pp;
4556                }
4557                *pp = NULL;
4558                nskb->next = NULL;
4559                napi_gro_complete(nskb);
4560        } else {
4561                napi->gro_count++;
4562        }
4563        NAPI_GRO_CB(skb)->count = 1;
4564        NAPI_GRO_CB(skb)->age = jiffies;
4565        NAPI_GRO_CB(skb)->last = skb;
4566        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4567        skb->next = napi->gro_list;
4568        napi->gro_list = skb;
4569        ret = GRO_HELD;
4570
4571pull:
4572        grow = skb_gro_offset(skb) - skb_headlen(skb);
4573        if (grow > 0)
4574                gro_pull_from_frag0(skb, grow);
4575ok:
4576        return ret;
4577
4578normal:
4579        ret = GRO_NORMAL;
4580        goto pull;
4581}
4582
4583struct packet_offload *gro_find_receive_by_type(__be16 type)
4584{
4585        struct list_head *offload_head = &offload_base;
4586        struct packet_offload *ptype;
4587
4588        list_for_each_entry_rcu(ptype, offload_head, list) {
4589                if (ptype->type != type || !ptype->callbacks.gro_receive)
4590                        continue;
4591                return ptype;
4592        }
4593        return NULL;
4594}
4595EXPORT_SYMBOL(gro_find_receive_by_type);
4596
4597struct packet_offload *gro_find_complete_by_type(__be16 type)
4598{
4599        struct list_head *offload_head = &offload_base;
4600        struct packet_offload *ptype;
4601
4602        list_for_each_entry_rcu(ptype, offload_head, list) {
4603                if (ptype->type != type || !ptype->callbacks.gro_complete)
4604                        continue;
4605                return ptype;
4606        }
4607        return NULL;
4608}
4609EXPORT_SYMBOL(gro_find_complete_by_type);
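/* A minimal sketch of how an encapsulation protocol's gro_receive callback
 * uses the lookup helpers above to chain into the inner protocol's GRO
 * handler (compare fou/gue).  'head', 'skb' and 'proto' are assumed to come
 * from the surrounding callback; example_inner_gro_receive() is hypothetical.
 */
static struct sk_buff **example_inner_gro_receive(struct sk_buff **head,
                                                  struct sk_buff *skb,
                                                  __be16 proto)
{
        struct sk_buff **pp = NULL;
        struct packet_offload *ptype;

        rcu_read_lock();
        ptype = gro_find_receive_by_type(proto);
        if (ptype)
                pp = ptype->callbacks.gro_receive(head, skb);
        else
                NAPI_GRO_CB(skb)->flush = 1;    /* no offload: don't hold the skb */
        rcu_read_unlock();

        return pp;
}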
4610
4611static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4612{
4613        switch (ret) {
4614        case GRO_NORMAL:
4615                if (netif_receive_skb_internal(skb))
4616                        ret = GRO_DROP;
4617                break;
4618
4619        case GRO_DROP:
4620                kfree_skb(skb);
4621                break;
4622
4623        case GRO_MERGED_FREE:
4624                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4625                        skb_dst_drop(skb);
4626                        kmem_cache_free(skbuff_head_cache, skb);
4627                } else {
4628                        __kfree_skb(skb);
4629                }
4630                break;
4631
4632        case GRO_HELD:
4633        case GRO_MERGED:
4634                break;
4635        }
4636
4637        return ret;
4638}
4639
4640gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4641{
4642        skb_mark_napi_id(skb, napi);
4643        trace_napi_gro_receive_entry(skb);
4644
4645        skb_gro_reset_offset(skb);
4646
4647        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4648}
4649EXPORT_SYMBOL(napi_gro_receive);
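/* A minimal sketch of the usual NAPI poll loop feeding completed receive
 * buffers into GRO (example_fetch_rx_skb() stands in for the driver's
 * descriptor handling and is hypothetical).
 */
static int example_poll(struct napi_struct *napi, int budget)
{
        int done = 0;

        while (done < budget) {
                struct sk_buff *skb = example_fetch_rx_skb(napi);

                if (!skb)
                        break;
                napi_gro_receive(napi, skb);
                done++;
        }

        if (done < budget)
                napi_complete(napi);    /* driver re-enables its IRQ afterwards */

        return done;
}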
4650
4651static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4652{
4653        if (unlikely(skb->pfmemalloc)) {
4654                consume_skb(skb);
4655                return;
4656        }
4657        __skb_pull(skb, skb_headlen(skb));
4658        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4659        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4660        skb->vlan_tci = 0;
4661        skb->dev = napi->dev;
4662        skb->skb_iif = 0;
4663        skb->encapsulation = 0;
4664        skb_shinfo(skb)->gso_type = 0;
4665        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4666
4667        napi->skb = skb;
4668}
4669
4670struct sk_buff *napi_get_frags(struct napi_struct *napi)
4671{
4672        struct sk_buff *skb = napi->skb;
4673
4674        if (!skb) {
4675                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4676                if (skb) {
4677                        napi->skb = skb;
4678                        skb_mark_napi_id(skb, napi);
4679                }
4680        }
4681        return skb;
4682}
4683EXPORT_SYMBOL(napi_get_frags);
4684
4685static gro_result_t napi_frags_finish(struct napi_struct *napi,
4686                                      struct sk_buff *skb,
4687                                      gro_result_t ret)
4688{
4689        switch (ret) {
4690        case GRO_NORMAL:
4691        case GRO_HELD:
4692                __skb_push(skb, ETH_HLEN);
4693                skb->protocol = eth_type_trans(skb, skb->dev);
4694                if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4695                        ret = GRO_DROP;
4696                break;
4697
4698        case GRO_DROP:
4699        case GRO_MERGED_FREE:
4700                napi_reuse_skb(napi, skb);
4701                break;
4702
4703        case GRO_MERGED:
4704                break;
4705        }
4706
4707        return ret;
4708}
4709
4710/* The upper GRO stack assumes the network header starts at gro_offset=0.
4711 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4712 * we copy the ethernet header into skb->data to have a common layout.
4713 */
4714static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4715{
4716        struct sk_buff *skb = napi->skb;
4717        const struct ethhdr *eth;
4718        unsigned int hlen = sizeof(*eth);
4719
4720        napi->skb = NULL;
4721
4722        skb_reset_mac_header(skb);
4723        skb_gro_reset_offset(skb);
4724
4725        eth = skb_gro_header_fast(skb, 0);
4726        if (unlikely(skb_gro_header_hard(skb, hlen))) {
4727                eth = skb_gro_header_slow(skb, hlen, 0);
4728                if (unlikely(!eth)) {
4729                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4730                                             __func__, napi->dev->name);
4731                        napi_reuse_skb(napi, skb);
4732                        return NULL;
4733                }
4734        } else {
4735                gro_pull_from_frag0(skb, hlen);
4736                NAPI_GRO_CB(skb)->frag0 += hlen;
4737                NAPI_GRO_CB(skb)->frag0_len -= hlen;
4738        }
4739        __skb_pull(skb, hlen);
4740
4741        /*
4742         * This works because the only protocols we care about don't require
4743         * special handling.
4744         * We'll fix it up properly in napi_frags_finish()
4745         */
4746        skb->protocol = eth->h_proto;
4747
4748        return skb;
4749}
4750
4751gro_result_t napi_gro_frags(struct napi_struct *napi)
4752{
4753        struct sk_buff *skb = napi_frags_skb(napi);
4754
4755        if (!skb)
4756                return GRO_DROP;
4757
4758        trace_napi_gro_frags_entry(skb);
4759
4760        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4761}
4762EXPORT_SYMBOL(napi_gro_frags);
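
/*
 * Illustrative sketch (not from this file): the napi_get_frags() /
 * napi_gro_frags() pattern used by drivers that receive directly into
 * pages.  The page, offset and length would come from the driver's receive
 * descriptor, and the driver is assumed to already hold a reference on the
 * page; skb_add_rx_frag() and the two NAPI calls are real APIs.
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static gro_result_t example_rx_page(struct napi_struct *napi,
                                    struct page *page,
                                    unsigned int offset, unsigned int len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (unlikely(!skb))
                return GRO_DROP;

        /* Attach the page as frag 0; updates skb->len/data_len/truesize. */
        skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);

        /*
         * napi_gro_frags() reads the Ethernet header out of the first
         * fragment, sets skb->protocol and runs the normal GRO machinery.
         */
        return napi_gro_frags(napi);
}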
4763
4764/* Compute the checksum from gro_offset and return the folded value
4765 * after adding in any pseudo checksum.
4766 */
4767__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4768{
4769        __wsum wsum;
4770        __sum16 sum;
4771
4772        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4773
4774        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4775        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4776        if (likely(!sum)) {
4777                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4778                    !skb->csum_complete_sw)
4779                        netdev_rx_csum_fault(skb->dev);
4780        }
4781
4782        NAPI_GRO_CB(skb)->csum = wsum;
4783        NAPI_GRO_CB(skb)->csum_valid = 1;
4784
4785        return sum;
4786}
4787EXPORT_SYMBOL(__skb_gro_checksum_complete);
4788
4789/*
4790 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4791 * Note: called with local irq disabled, but exits with local irq enabled.
4792 */
4793static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4794{
4795#ifdef CONFIG_RPS
4796        struct softnet_data *remsd = sd->rps_ipi_list;
4797
4798        if (remsd) {
4799                sd->rps_ipi_list = NULL;
4800
4801                local_irq_enable();
4802
4803                /* Send pending IPIs to kick RPS processing on remote cpus. */
4804                while (remsd) {
4805                        struct softnet_data *next = remsd->rps_ipi_next;
4806
4807                        if (cpu_online(remsd->cpu))
4808                                smp_call_function_single_async(remsd->cpu,
4809                                                           &remsd->csd);
4810                        remsd = next;
4811                }
4812        } else
4813#endif
4814                local_irq_enable();
4815}
4816
4817static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4818{
4819#ifdef CONFIG_RPS
4820        return sd->rps_ipi_list != NULL;
4821#else
4822        return false;
4823#endif
4824}
4825
4826static int process_backlog(struct napi_struct *napi, int quota)
4827{
4828        int work = 0;
4829        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4830
4831        /* Check if we have pending IPIs; it's better to send them now
4832         * rather than waiting for net_rx_action() to end.
4833         */
4834        if (sd_has_rps_ipi_waiting(sd)) {
4835                local_irq_disable();
4836                net_rps_action_and_irq_enable(sd);
4837        }
4838
4839        napi->weight = weight_p;
4840        local_irq_disable();
4841        while (1) {
4842                struct sk_buff *skb;
4843
4844                while ((skb = __skb_dequeue(&sd->process_queue))) {
4845                        rcu_read_lock();
4846                        local_irq_enable();
4847                        __netif_receive_skb(skb);
4848                        rcu_read_unlock();
4849                        local_irq_disable();
4850                        input_queue_head_incr(sd);
4851                        if (++work >= quota) {
4852                                local_irq_enable();
4853                                return work;
4854                        }
4855                }
4856
4857                rps_lock(sd);
4858                if (skb_queue_empty(&sd->input_pkt_queue)) {
4859                        /*
4860                         * Inline a custom version of __napi_complete().
4861                         * Only the current cpu owns and manipulates this napi,
4862                         * and NAPI_STATE_SCHED is the only possible flag set
4863                         * on backlog.
4864                         * We can use a plain write instead of clear_bit(),
4865                         * and we don't need an smp_mb() memory barrier.
4866                         */
4867                        napi->state = 0;
4868                        rps_unlock(sd);
4869
4870                        break;
4871                }
4872
4873                skb_queue_splice_tail_init(&sd->input_pkt_queue,
4874                                           &sd->process_queue);
4875                rps_unlock(sd);
4876        }
4877        local_irq_enable();
4878
4879        return work;
4880}
4881
4882/**
4883 * __napi_schedule - schedule for receive
4884 * @n: entry to schedule
4885 *
4886 * The entry's receive function will be scheduled to run.
4887 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4888 */
4889void __napi_schedule(struct napi_struct *n)
4890{
4891        unsigned long flags;
4892
4893        local_irq_save(flags);
4894        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4895        local_irq_restore(flags);
4896}
4897EXPORT_SYMBOL(__napi_schedule);
4898
4899/**
4900 * __napi_schedule_irqoff - schedule for receive
4901 * @n: entry to schedule
4902 *
4903 * Variant of __napi_schedule() assuming hard irqs are masked
4904 */
4905void __napi_schedule_irqoff(struct napi_struct *n)
4906{
4907        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4908}
4909EXPORT_SYMBOL(__napi_schedule_irqoff);
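
/*
 * Illustrative sketch (not from this file): the interrupt-handler side of
 * NAPI scheduling.  example_mask_irqs() is a hypothetical device-specific
 * helper; napi_schedule_prep() and __napi_schedule_irqoff() are the real
 * entry points, the latter because hard irqs are masked in an irq handler.
 */
#include <linux/interrupt.h>
#include <linux/netdevice.h>

void example_mask_irqs(struct net_device *dev);         /* hypothetical */

static irqreturn_t example_intr(int irq, void *data)
{
        struct napi_struct *napi = data;

        if (napi_schedule_prep(napi)) {
                /* Stop the device from raising further RX interrupts. */
                example_mask_irqs(napi->dev);
                __napi_schedule_irqoff(napi);
        }
        return IRQ_HANDLED;
}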
4910
4911void __napi_complete(struct napi_struct *n)
4912{
4913        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4914
4915        list_del_init(&n->poll_list);
4916        smp_mb__before_atomic();
4917        clear_bit(NAPI_STATE_SCHED, &n->state);
4918}
4919EXPORT_SYMBOL(__napi_complete);
4920
4921void napi_complete_done(struct napi_struct *n, int work_done)
4922{
4923        unsigned long flags;
4924
4925        /*
4926         * Don't let napi dequeue from the cpu poll list
4927         * just in case it's running on a different cpu.
4928         */
4929        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4930                return;
4931
4932        if (n->gro_list) {
4933                unsigned long timeout = 0;
4934
4935                if (work_done)
4936                        timeout = n->dev->gro_flush_timeout;
4937
4938                if (timeout)
4939                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
4940                                      HRTIMER_MODE_REL_PINNED);
4941                else
4942                        napi_gro_flush(n, false);
4943        }
4944        if (likely(list_empty(&n->poll_list))) {
4945                WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4946        } else {
4947                /* If n->poll_list is not empty, we need to mask irqs */
4948                local_irq_save(flags);
4949                __napi_complete(n);
4950                local_irq_restore(flags);
4951        }
4952}
4953EXPORT_SYMBOL(napi_complete_done);
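
/*
 * Illustrative sketch (not from this file): the completion side of a poll
 * routine.  When the budget is not exhausted the driver reports the work
 * actually done via napi_complete_done(), so gro_flush_timeout can be
 * honoured, and re-enables device interrupts.  The example_* helpers are
 * hypothetical.
 */
#include <linux/netdevice.h>

int example_clean_rx(struct napi_struct *napi, int budget);     /* hypothetical */
void example_unmask_irqs(struct net_device *dev);               /* hypothetical */

static int example_poll(struct napi_struct *napi, int budget)
{
        int work_done = example_clean_rx(napi, budget);

        if (work_done < budget) {
                napi_complete_done(napi, work_done);
                example_unmask_irqs(napi->dev);
        }
        /* Returning the full budget keeps this instance on the poll list. */
        return work_done;
}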
4954
4955/* must be called under rcu_read_lock(), as we don't take a reference */
4956static struct napi_struct *napi_by_id(unsigned int napi_id)
4957{
4958        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4959        struct napi_struct *napi;
4960
4961        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4962                if (napi->napi_id == napi_id)
4963                        return napi;
4964
4965        return NULL;
4966}
4967
4968#if defined(CONFIG_NET_RX_BUSY_POLL)
4969#define BUSY_POLL_BUDGET 8
4970bool sk_busy_loop(struct sock *sk, int nonblock)
4971{
4972        unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4973        int (*busy_poll)(struct napi_struct *dev);
4974        struct napi_struct *napi;
4975        int rc = false;
4976
4977        rcu_read_lock();
4978
4979        napi = napi_by_id(sk->sk_napi_id);
4980        if (!napi)
4981                goto out;
4982
4983        /* Note: ndo_busy_poll method is optional in linux-4.5 */
4984        busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4985
4986        do {
4987                rc = 0;
4988                local_bh_disable();
4989                if (busy_poll) {
4990                        rc = busy_poll(napi);
4991                } else if (napi_schedule_prep(napi)) {
4992                        void *have = netpoll_poll_lock(napi);
4993
4994                        if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4995                                rc = napi->poll(napi, BUSY_POLL_BUDGET);
4996                                trace_napi_poll(napi);
4997                                if (rc == BUSY_POLL_BUDGET) {
4998                                        napi_complete_done(napi, rc);
4999                                        napi_schedule(napi);
5000                                }
5001                        }
5002                        netpoll_poll_unlock(have);
5003                }
5004                if (rc > 0)
5005                        __NET_ADD_STATS(sock_net(sk),
5006                                        LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5007                local_bh_enable();
5008
5009                if (rc == LL_FLUSH_FAILED)
5010                        break; /* permanent failure */
5011
5012                cpu_relax();
5013        } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
5014                 !need_resched() && !busy_loop_timeout(end_time));
5015
5016        rc = !skb_queue_empty(&sk->sk_receive_queue);
5017out:
5018        rcu_read_unlock();
5019        return rc;
5020}
5021EXPORT_SYMBOL(sk_busy_loop);
5022
5023#endif /* CONFIG_NET_RX_BUSY_POLL */
5024
5025void napi_hash_add(struct napi_struct *napi)
5026{
5027        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5028            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5029                return;
5030
5031        spin_lock(&napi_hash_lock);
5032
5033        /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5034        do {
5035                if (unlikely(++napi_gen_id < NR_CPUS + 1))
5036                        napi_gen_id = NR_CPUS + 1;
5037        } while (napi_by_id(napi_gen_id));
5038        napi->napi_id = napi_gen_id;
5039
5040        hlist_add_head_rcu(&napi->napi_hash_node,
5041                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5042
5043        spin_unlock(&napi_hash_lock);
5044}
5045EXPORT_SYMBOL_GPL(napi_hash_add);
5046
5047/* Warning: the caller is responsible for making sure an rcu grace period
5048 * is respected before freeing the memory containing @napi.
5049 */
5050bool napi_hash_del(struct napi_struct *napi)
5051{
5052        bool rcu_sync_needed = false;
5053
5054        spin_lock(&napi_hash_lock);
5055
5056        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5057                rcu_sync_needed = true;
5058                hlist_del_rcu(&napi->napi_hash_node);
5059        }
5060        spin_unlock(&napi_hash_lock);
5061        return rcu_sync_needed;
5062}
5063EXPORT_SYMBOL_GPL(napi_hash_del);
5064
5065static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5066{
5067        struct napi_struct *napi;
5068
5069        napi = container_of(timer, struct napi_struct, timer);
5070        if (napi->gro_list)
5071                napi_schedule(napi);
5072
5073        return HRTIMER_NORESTART;
5074}
5075
5076void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5077                    int (*poll)(struct napi_struct *, int), int weight)
5078{
5079        INIT_LIST_HEAD(&napi->poll_list);
5080        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5081        napi->timer.function = napi_watchdog;
5082        napi->gro_count = 0;
5083        napi->gro_list = NULL;
5084        napi->skb = NULL;
5085        napi->poll = poll;
5086        if (weight > NAPI_POLL_WEIGHT)
5087                pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5088                            weight, dev->name);
5089        napi->weight = weight;
5090        list_add(&napi->dev_list, &dev->napi_list);
5091        napi->dev = dev;
5092#ifdef CONFIG_NETPOLL
5093        spin_lock_init(&napi->poll_lock);
5094        napi->poll_owner = -1;
5095#endif
5096        set_bit(NAPI_STATE_SCHED, &napi->state);
5097        napi_hash_add(napi);
5098}
5099EXPORT_SYMBOL(netif_napi_add);
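
/*
 * Illustrative sketch (not from this file): registering the NAPI context at
 * probe time.  example_poll() is a hypothetical poll routine;
 * NAPI_POLL_WEIGHT is the recommended default weight.
 */
#include <linux/netdevice.h>

int example_poll(struct napi_struct *napi, int budget);         /* hypothetical */

static void example_probe_napi(struct net_device *dev, struct napi_struct *napi)
{
        /*
         * netif_napi_add() leaves NAPI_STATE_SCHED set; the driver
         * normally calls napi_enable() later, from its ndo_open handler.
         */
        netif_napi_add(dev, napi, example_poll, NAPI_POLL_WEIGHT);
}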
5100
5101void napi_disable(struct napi_struct *n)
5102{
5103        might_sleep();
5104        set_bit(NAPI_STATE_DISABLE, &n->state);
5105
5106        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5107                msleep(1);
5108        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5109                msleep(1);
5110
5111        hrtimer_cancel(&n->timer);
5112
5113        clear_bit(NAPI_STATE_DISABLE, &n->state);
5114}
5115EXPORT_SYMBOL(napi_disable);
5116
5117/* Must be called in process context */
5118void netif_napi_del(struct napi_struct *napi)
5119{
5120        might_sleep();
5121        if (napi_hash_del(napi))
5122                synchronize_net();
5123        list_del_init(&napi->dev_list);
5124        napi_free_frags(napi);
5125
5126        kfree_skb_list(napi->gro_list);
5127        napi->gro_list = NULL;
5128        napi->gro_count = 0;
5129}
5130EXPORT_SYMBOL(netif_napi_del);
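
/*
 * Illustrative sketch (not from this file): the matching teardown.
 * napi_disable() is typically called from the stop path once the device can
 * no longer raise interrupts, and netif_napi_del() from the removal path in
 * process context (both may sleep).
 */
#include <linux/netdevice.h>

static void example_teardown_napi(struct napi_struct *napi)
{
        napi_disable(napi);     /* waits for any in-flight poll to finish */
        netif_napi_del(napi);   /* unhashes and frees pending GRO state */
}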
5131
5132static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5133{
5134        void *have;
5135        int work, weight;
5136
5137        list_del_init(&n->poll_list);
5138
5139        have = netpoll_poll_lock(n);
5140
5141        weight = n->weight;
5142
5143        /* This NAPI_STATE_SCHED test is for avoiding a race
5144         * with netpoll's poll_napi().  Only the entity which
5145         * obtains the lock and sees NAPI_STATE_SCHED set will
5146         * actually make the ->poll() call.  Therefore we avoid
5147         * accidentally calling ->poll() when NAPI is not scheduled.
5148         */
5149        work = 0;
5150        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5151                work = n->poll(n, weight);
5152                trace_napi_poll(n);
5153        }
5154
5155        WARN_ON_ONCE(work > weight);
5156
5157        if (likely(work < weight))
5158                goto out_unlock;
5159
5160        /* Drivers must not modify the NAPI state if they
5161         * consume the entire weight.  In such cases this code
5162         * still "owns" the NAPI instance and therefore can
5163         * move the instance around on the list at-will.
5164         */
5165        if (unlikely(napi_disable_pending(n))) {
5166                napi_complete(n);
5167                goto out_unlock;
5168        }
5169
5170        if (n->gro_list) {
5171                /* Flush packets that are too old.
5172                 * If HZ < 1000, flush all packets.
5173                 */
5174                napi_gro_flush(n, HZ >= 1000);
5175        }
5176
5177        /* Some drivers may have called napi_schedule
5178         * prior to exhausting their budget.
5179         */
5180        if (unlikely(!list_empty(&n->poll_list))) {
5181                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5182                             n->dev ? n->dev->name : "backlog");
5183                goto out_unlock;
5184        }
5185
5186        list_add_tail(&n->poll_list, repoll);
5187
5188out_unlock:
5189        netpoll_poll_unlock(have);
5190
5191        return work;
5192}
5193
5194static void net_rx_action(struct softirq_action *h)
5195{
5196        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5197        unsigned long time_limit = jiffies + 2;
5198        int budget = netdev_budget;
5199        LIST_HEAD(list);
5200        LIST_HEAD(repoll);
5201
5202        local_irq_disable();
5203        list_splice_init(&sd->poll_list, &list);
5204        local_irq_enable();
5205
5206        for (;;) {
5207                struct napi_struct *n;
5208
5209                if (list_empty(&list)) {
5210                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5211                                return;
5212                        break;
5213                }
5214
5215                n = list_first_entry(&list, struct napi_struct, poll_list);
5216                budget -= napi_poll(n, &repoll);
5217
5218                /* If the softirq window is exhausted then punt.
5219                 * Allow this to run for 2 jiffies, which allows
5220                 * an average latency of 1.5/HZ.
5221                 */
5222                if (unlikely(budget <= 0 ||
5223                             time_after_eq(jiffies, time_limit))) {
5224                        sd->time_squeeze++;
5225                        break;
5226                }
5227        }
5228
5229        __kfree_skb_flush();
5230        local_irq_disable();
5231
5232        list_splice_tail_init(&sd->poll_list, &list);
5233        list_splice_tail(&repoll, &list);
5234        list_splice(&list, &sd->poll_list);
5235        if (!list_empty(&sd->poll_list))
5236                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5237
5238        net_rps_action_and_irq_enable(sd);
5239}
5240
5241struct netdev_adjacent {
5242        struct net_device *dev;
5243
5244        /* upper master flag; there can only be one master device per list */
5245        bool master;
5246
5247        /* counter for the number of times this device was added to us */
5248        u16 ref_nr;
5249
5250        /* private field for the users */
5251        void *private;
5252
5253        struct list_head list;
5254        struct rcu_head rcu;
5255};
5256
5257static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5258                                                 struct list_head *adj_list)
5259{
5260        struct netdev_adjacent *adj;
5261
5262        list_for_each_entry(adj, adj_list, list) {
5263                if (adj->dev == adj_dev)
5264                        return adj;
5265        }
5266        return NULL;
5267}
5268
5269/**
5270 * netdev_has_upper_dev - Check if device is linked to an upper device
5271 * @dev: device
5272 * @upper_dev: upper device to check
5273 *
5274 * Find out if a device is linked to the specified upper device and return
5275 * true in case it is. Note that this checks only the immediate upper device,
5276 * not through a complete stack of devices. The caller must hold the RTNL lock.
5277 */
5278bool netdev_has_upper_dev(struct net_device *dev,
5279                          struct net_device *upper_dev)
5280{
5281        ASSERT_RTNL();
5282
5283        return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5284}
5285EXPORT_SYMBOL(netdev_has_upper_dev);
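
/*
 * Illustrative sketch (not from this file): checking a direct upper/lower
 * relationship.  netdev_has_upper_dev() requires the RTNL lock, taken here
 * for the duration of the check; the helper name is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static bool example_is_port_of(struct net_device *port, struct net_device *master)
{
        bool linked;

        rtnl_lock();
        linked = netdev_has_upper_dev(port, master);
        rtnl_unlock();

        return linked;
}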
5286
5287/**
5288 * netdev_has_any_upper_dev - Check if device is linked to some device
5289 * @dev: device
5290 *
5291 * Find out if a device is linked to an upper device and return true in case
5292 * it is. The caller must hold the RTNL lock.
5293 */
5294static bool netdev_has_any_upper_dev(struct net_device *dev)
5295{
5296        ASSERT_RTNL();
5297
5298        return !list_empty(&dev->all_adj_list.upper);
5299}
5300
5301/**
5302 * netdev_master_upper_dev_get - Get master upper device
5303 * @dev: device
5304 *
5305 * Find a master upper device and return pointer to it or NULL in case
5306 * it's not there. The caller must hold the RTNL lock.
5307 */
5308struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5309{
5310        struct netdev_adjacent *upper;
5311
5312        ASSERT_RTNL();
5313
5314        if (list_empty(&dev->adj_list.upper))
5315                return NULL;
5316
5317        upper = list_first_entry(&dev->adj_list.upper,
5318                                 struct netdev_adjacent, list);
5319        if (likely(upper->master))
5320                return upper->dev;
5321        return NULL;
5322}
5323EXPORT_SYMBOL(netdev_master_upper_dev_get);
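
/*
 * Illustrative sketch (not from this file): resolving the master device
 * (e.g. a bond or bridge) of a port under RTNL, falling back to the port
 * itself when it has no master.  The helper name is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static struct net_device *example_master_or_self(struct net_device *dev)
{
        struct net_device *master;

        ASSERT_RTNL();

        master = netdev_master_upper_dev_get(dev);
        return master ? master : dev;
}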
5324
5325void *netdev_adjacent_get_private(struct list_head *adj_list)
5326{
5327        struct netdev_adjacent *adj;
5328
5329        adj = list_entry(adj_list, struct netdev_adjacent, list);
5330
5331        return adj->private;
5332}
5333EXPORT_SYMBOL(netdev_adjacent_get_private);
5334
5335/**
5336 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5337 * @dev: device
5338 * @iter: list_head ** of the current position
5339 *
5340 * Gets the next device from the dev's upper list, starting from iter
5341 * position. The caller must hold RCU read lock.
5342 */
5343struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5344                                                 struct list_head **iter)
5345{
5346        struct netdev_adjacent *upper;
5347
5348        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5349
5350        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5351
5352        if (&upper->list == &dev->adj_list.upper)
5353                return NULL;
5354
5355        *iter = &upper->list;
5356
5357        return upper->dev;
5358}
5359EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
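
/*
 * Illustrative sketch (not from this file): walking the immediate upper
 * devices under RCU with the iterator above.  The iterator starts at the
 * list head, exactly as netdev_upper_get_next_dev_rcu() expects; only
 * direct uppers are visited.  The helper name is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rcupdate.h>

static unsigned int example_count_uppers(struct net_device *dev)
{
        struct list_head *iter = &dev->adj_list.upper;
        struct net_device *upper;
        unsigned int n = 0;

        rcu_read_lock();
        while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
                n++;
        rcu_read_unlock();

        return n;
}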
5360
5361/**
5362 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5363 * @dev: device
5364 * @iter: list_head ** of the current position
5365 *
5366 * Gets the next device from the dev's upper list, starting from iter
5367 * position. The caller must hold RCU read lock.
5368 */
5369struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5370                                                     struct list_head **iter)
5371{
5372        struct netdev_adjacent *upper;
5373
5374        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5375
5376        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5377
5378        if (&upper->list == &dev->all_adj_list.upper)
5379                return NULL;
5380
5381        *iter = &upper->list;
5382
5383        return upper->dev;
5384}
5385EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5386
5387/**
5388 * netdev_lower_get_next_private - Get the next ->private from the
5389 *                                 lower neighbour list
5390 * @dev: device
5391 * @iter: list_head ** of the current position
5392 *
5393 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5394 * list, starting from iter position. The caller must either hold the
5395 * RTNL lock or its own locking that guarantees that the neighbour lower
5396 * list will remain unchanged.
5397 */
5398void *netdev_lower_get_next_private(struct net_device *dev,
5399                                    struct list_head **iter)
5400{
5401        struct netdev_adjacent *lower;
5402
5403        lower = list_entry(*iter, struct netdev_adjacent, list);
5404
5405        if (&lower->list == &dev->adj_list.lower)
5406                return NULL;
5407
5408        *iter = lower->list.next;
5409
5410        return lower->private;
5411}
5412EXPORT_SYMBOL(netdev_lower_get_next_private);
5413
5414/**
5415 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5416 *                                     lower neighbour list, RCU
5417 *                                     variant
5418 * @dev: device
5419 * @iter: list_head ** of the current position
5420 *
5421 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5422 * list, starting from iter position. The caller must hold RCU read lock.
5423 */
5424void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5425                                        struct list_head **iter)
5426{
5427        struct netdev_adjacent *lower;
5428
5429        WARN_ON_ONCE(!rcu_read_lock_held());
5430
5431        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5432
5433        if (&lower->list == &dev->adj_list.lower)
5434                return NULL;
5435
5436        *iter = &lower->list;
5437
5438        return lower->private;
5439}
5440EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5441
5442/**
5443 * netdev_lower_get_next - Get the next device from the lower neighbour
5444 *                         list
5445 * @dev: device
5446 * @iter: list_head ** of the current position
5447 *
5448 * Gets the next netdev_adjacent from the dev's lower neighbour
5449 * list, starting from iter position. The caller must hold the RTNL lock or
5450 * its own locking that guarantees that the neighbour lower
5451 * list will remain unchanged.
5452 */
5453void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5454{
5455        struct netdev_adjacent *lower;
5456
5457        lower = list_entry(*iter, struct netdev_adjacent, list);
5458
5459        if (&lower->list == &dev->adj_list.lower)
5460                return NULL;
5461
5462        *iter = lower->list.next;
5463
5464        return lower->dev;
5465}
5466EXPORT_SYMBOL(netdev_lower_get_next);
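
/*
 * Illustrative sketch (not from this file): walking the immediate lower
 * devices (e.g. the ports of a bond or bridge) with netdev_lower_get_next().
 * The iterator starts at the first list entry, matching the function above,
 * and the caller must hold RTNL or otherwise keep the lower list stable.
 * The helper name and callback are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void example_for_each_port(struct net_device *master,
                                  void (*fn)(struct net_device *port))
{
        struct list_head *iter = master->adj_list.lower.next;
        struct net_device *port;

        ASSERT_RTNL();

        while ((port = netdev_lower_get_next(master, &iter)))
                fn(port);
}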
5467
5468/**
5469 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5470 *                                     lower neighbour list, RCU
5471 *                                     variant
5472 * @dev: device
5473 *
5474 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5475 * list. The caller must hold RCU read lock.
5476 */
5477void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5478{
5479        struct netdev_adjacent *lower;
5480
5481        lower = list_first_or_null_rcu(&dev->adj_list.lower,
5482                        struct netdev_adjacent, list);
5483        if (lower)
5484                return lower->private;
5485        return NULL;
5486}
5487EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5488
5489/**
5490 * netdev_master_upper_dev_get_rcu - Get master upper device
5491 * @dev: device
5492 *
5493 * Find a master upper device and return pointer to it or NULL in case
5494 * it's not there. The caller must hold the RCU read lock.
5495 */
5496struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5497{
5498        struct netdev_adjacent *upper;
5499
5500        upper = list_first_or_null_rcu(&dev->adj_list.upper,
5501                                       struct netdev_adjacent, list);
5502        if (upper && likely(upper->master))
5503                return upper->dev;
5504        return NULL;
5505}
5506EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5507
5508static int netdev_adjacent_sysfs_add(struct net_device *dev,
5509                              struct net_device *adj_dev,
5510                              struct list_head *dev_list)
5511{
5512        char linkname[IFNAMSIZ+7];
5513        sprintf(linkname, dev_list == &dev->adj_list.upper ?
5514                "upper_%s" : "lower_%s", adj_dev->name);
5515        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5516                                 linkname);
5517}
5518static void netdev_adjacent_sysfs_del(struct net_device *dev,
5519                               char *name,
5520                               struct list_head *dev_list)
5521{
5522        char linkname[IFNAMSIZ+7];
5523        sprintf(linkname, dev_list == &dev->adj_list.upper ?
5524                "upper_%s" : "lower_%s", name);
5525        sysfs_remove_link(&(dev->dev.kobj), linkname);
5526}
5527
5528static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5529                                                 struct net_device *adj_dev,
5530                                                 struct list_head *dev_list)
5531{
5532        return (dev_list == &dev->adj_list.upper ||
5533                dev_list == &dev->adj_list.lower) &&
5534                net_eq(dev_net(dev), dev_net(adj_dev));
5535}
5536
5537static int __netdev_adjacent_dev_insert(struct net_device *dev,
5538                                        struct net_device *adj_dev,
5539                                        struct list_head *dev_list,
5540                                        void *private, bool master)
5541{
5542        struct netdev_adjacent *adj;
5543        int ret;
5544
5545        adj = __netdev_find_adj(adj_dev, dev_list);
5546
5547        if (adj) {
5548                adj->ref_nr++;
5549                return 0;
5550        }
5551
5552        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5553        if (!adj)
5554                return -ENOMEM;
5555
5556        adj->dev = adj_dev;
5557        adj->master = master;
5558        adj->ref_nr = 1;
5559        adj->private = private;
5560        dev_hold(adj_dev);
5561
5562        pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5563                 adj_dev->name, dev->name, adj_dev->name);
5564
5565        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5566                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5567                if (ret)
5568                        goto free_adj;
5569        }
5570
5571        /* Ensure that master link is always the first item in list. */
5572        if (master) {
5573                ret = sysfs_create_link(&(dev->dev.kobj),
5574                                        &(adj_dev->dev.kobj), "master");
5575                if (ret)
5576                        goto remove_symlinks;
5577
5578                list_add_rcu(&adj->list, dev_list);
5579        } else {
5580                list_add_tail_rcu(&adj->list, dev_list);
5581        }
5582
5583        return 0;
5584
5585remove_symlinks:
5586        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5587                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5588free_adj:
5589        kfree(adj);
5590        dev_put(adj_dev);
5591
5592        return ret;
5593}
5594
5595static void __netdev_adjacent_dev_remove(struct net_device *dev,
5596                                         struct net_device *adj_dev,
5597                                         struct list_head *dev_list)
5598{
5599        struct netdev_adjacent *adj;
5600
5601        adj = __netdev_find_adj(adj_dev, dev_list);
5602
5603        if (!adj) {
5604                pr_err("tried to remove device %s from %s\n",
5605                       dev->name, adj_dev->name);
5606                BUG();
5607        }
5608
5609        if (adj->ref_nr > 1) {
5610                pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5611                         adj->ref_nr-1);
5612                adj->ref_nr--;
5613                return;
5614        }
5615
5616        if (adj->master)
5617                sysfs_remove_link(&(dev->dev.kobj), "master");
5618
5619        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5620                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5621
5622        list_del_rcu(&adj->list);
5623        pr_debug("dev_put for %s, because link removed from %s to %s\n",
5624                 adj_dev->name, dev->name, adj_dev->name);
5625        dev_put(adj_dev);
5626        kfree_rcu(adj, rcu);
5627}
5628
5629static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5630                                            struct net_device *upper_dev,
5631                                            struct list_head *up_list,
5632                                            struct list_head *down_list,
5633                                            void *private, bool master)
5634{
5635        int ret;
5636
5637        ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5638                                           master);
5639        if (ret)
5640                return ret;
5641
5642        ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5643                                           false);
5644        if (ret) {
5645                __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5646                return ret;
5647        }
5648
5649        return 0;
5650}
5651
5652static int __netdev_adjacent_dev_link(struct net_device *dev,
5653                                      struct net_device *upper_dev)
5654{
5655        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5656                                                &dev->all_adj_list.upper,
5657                                                &upper_dev->all_adj_list.lower,
5658                                                NULL, false);
5659}
5660
5661static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5662                                               struct net_device *upper_dev,
5663                                               struct list_head *up_list,
5664                                               struct list_head *down_list)
5665{
5666        __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5667        __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5668}
5669
5670static void __netdev_adjacent_dev_unlink(struct net_device *dev,