linux/drivers/net/vxlan.c
<<
>>
Prefs
   1/*
   2 * VXLAN: Virtual eXtensible Local Area Network
   3 *
   4 * Copyright (c) 2012 Vyatta Inc.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 as
   8 * published by the Free Software Foundation.
   9 *
  10 * TODO
  11 *  - use IANA UDP port number (when defined)
  12 *  - IPv6 (not in RFC)
  13 */
  14
/* Format hook for the pr_*() helpers: expands to the module name
 * (KBUILD_MODNAME) followed by ": " and the caller's format string.
 * Defined ahead of the #include lines that follow it.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  16
  17#include <linux/kernel.h>
  18#include <linux/types.h>
  19#include <linux/module.h>
  20#include <linux/errno.h>
  21#include <linux/slab.h>
  22#include <linux/skbuff.h>
  23#include <linux/rculist.h>
  24#include <linux/netdevice.h>
  25#include <linux/in.h>
  26#include <linux/ip.h>
  27#include <linux/udp.h>
  28#include <linux/igmp.h>
  29#include <linux/etherdevice.h>
  30#include <linux/if_ether.h>
  31#include <linux/hash.h>
  32#include <net/arp.h>
  33#include <net/ndisc.h>
  34#include <net/ip.h>
  35#include <net/icmp.h>
  36#include <net/udp.h>
  37#include <net/rtnetlink.h>
  38#include <net/route.h>
  39#include <net/dsfield.h>
  40#include <net/inet_ecn.h>
  41#include <net/net_namespace.h>
  42#include <net/netns/generic.h>
  43
/* Driver version string. */
#define VXLAN_VERSION   "0.1"

/* Per-namespace VNI table: 2^VNI_HASH_BITS buckets (struct vxlan_net.vni_list) */
#define VNI_HASH_BITS   10
#define VNI_HASH_SIZE   (1<<VNI_HASH_BITS)
/* Per-device forwarding table: 2^FDB_HASH_BITS buckets (struct vxlan_dev.fdb_head) */
#define FDB_HASH_BITS   8
#define FDB_HASH_SIZE   (1<<FDB_HASH_BITS)
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)      /* rescan interval */

/* The VNI is a 24-bit value: number of ids and the matching bit mask */
#define VXLAN_N_VID     (1u << 24)
#define VXLAN_VID_MASK  (VXLAN_N_VID - 1)
/* Bytes of encapsulation added in front of the inner payload:
 * IP header + UDP + VXLAN + Ethernet header
 */
#define VXLAN_HEADROOM (20 + 8 + 8 + 14)

#define VXLAN_FLAGS 0x08000000  /* struct vxlanhdr.vx_flags required value. */
  59
/* VXLAN protocol header, as carried on the wire directly after the outer
 * UDP header.  Both words are in network byte order.
 */
struct vxlanhdr {
        __be32 vx_flags;        /* receive path requires exactly htonl(VXLAN_FLAGS) */
        __be32 vx_vni;          /* VNI in the upper 24 bits; the low 8 bits must be zero */
};
  65
/* UDP port for VXLAN traffic.  Exposed as module parameter "udp_port"
 * with mode 0444 (read-only through sysfs); written into the destination
 * port of every transmitted packet.
 */
static unsigned int vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, uint, 0444);
MODULE_PARM_DESC(udp_port, "Destination UDP port");

/* When true, the receive path emits a rate-limited message for packets
 * whose ECN decapsulation reports a problem.  Mode 0644.
 */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
  74
/* per-net private data for this module.
 * vxlan_net_id is the key handed to net_generic() to fetch the
 * struct vxlan_net that belongs to a given network namespace.
 */
static unsigned int vxlan_net_id;
struct vxlan_net {
        struct socket     *sock;        /* UDP encap socket */
        struct hlist_head vni_list[VNI_HASH_SIZE];      /* vxlan_dev hashed by VNI */
};
  81
/* Forwarding table entry: maps an inner Ethernet address to the IPv4
 * address of the remote tunnel endpoint.
 */
struct vxlan_fdb {
        struct hlist_node hlist;        /* linked list of entries */
        struct rcu_head   rcu;          /* for kfree_rcu() on destroy */
        unsigned long     updated;      /* jiffies */
        unsigned long     used;         /* jiffies; refreshed when snooping sees the address */
        __be32            remote_ip;    /* remote endpoint; 0 is treated as bridge-local */
        u16               state;        /* see ndm_state */
        u8                eth_addr[ETH_ALEN];   /* lookup key */
};
  92
/* Per-cpu network traffic stats.
 * Writers bracket updates with u64_stats_update_begin()/_end() on syncp.
 */
struct vxlan_stats {
        u64                     rx_packets;
        u64                     rx_bytes;
        u64                     tx_packets;
        u64                     tx_bytes;
        struct u64_stats_sync   syncp;
};
 101
/* Pseudo network device: private state of one VXLAN interface
 * (obtained from the net_device with netdev_priv()).
 */
struct vxlan_dev {
        struct hlist_node hlist;        /* entry in vxlan_net.vni_list */
        struct net_device *dev;         /* the net_device this state belongs to */
        struct vxlan_stats __percpu *stats;
        __u32             vni;          /* virtual network id */
        __be32            gaddr;        /* multicast group */
        __be32            saddr;        /* source address */
        unsigned int      link;         /* link to multicast over */
        __u16             port_min;     /* source port range */
        __u16             port_max;
        __u8              tos;          /* TOS override; 1 = copy from inner header */
        __u8              ttl;          /* 0 = 1 for multicast, else route hop limit */
        u32               flags;        /* VXLAN_F_* below */

        /* NOTE(review): the two ageing fields are not used in the code
         * visible here; presumably they drive expiry of learned entries.
         */
        unsigned long     age_interval;
        struct timer_list age_timer;
        spinlock_t        hash_lock;    /* held around fdb create/destroy */
        unsigned int      addrcnt;      /* current number of fdb entries */
        unsigned int      addrmax;      /* entry limit; 0 = no limit */

        struct hlist_head fdb_head[FDB_HASH_SIZE];      /* fdb hashed by MAC */
};
 125
/* Bits for struct vxlan_dev.flags */
#define VXLAN_F_LEARN   0x01    /* learn MAC -> endpoint from received packets */
#define VXLAN_F_PROXY   0x02    /* answer ARP requests locally on transmit */
#define VXLAN_F_RSC     0x04    /* try route short-circuit on transmitted IPv4 */
#define VXLAN_F_L2MISS  0x08    /* send RTM_GETNEIGH when no fdb entry matches */
#define VXLAN_F_L3MISS  0x10    /* send RTM_GETNEIGH when no neighbour matches */

/* salt for hash table
 * NOTE(review): not referenced by any code visible in this part of the file.
 */
static u32 vxlan_salt __read_mostly;
 134
 135static inline struct hlist_head *vni_head(struct net *net, u32 id)
 136{
 137        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 138
 139        return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
 140}
 141
 142/* Look up VNI in a per net namespace table */
 143static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
 144{
 145        struct vxlan_dev *vxlan;
 146        struct hlist_node *node;
 147
 148        hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
 149                if (vxlan->vni == id)
 150                        return vxlan;
 151        }
 152
 153        return NULL;
 154}
 155
 156/* Fill in neighbour message in skbuff. */
 157static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 158                           const struct vxlan_fdb *fdb,
 159                           u32 portid, u32 seq, int type, unsigned int flags)
 160{
 161        unsigned long now = jiffies;
 162        struct nda_cacheinfo ci;
 163        struct nlmsghdr *nlh;
 164        struct ndmsg *ndm;
 165        bool send_ip, send_eth;
 166
 167        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
 168        if (nlh == NULL)
 169                return -EMSGSIZE;
 170
 171        ndm = nlmsg_data(nlh);
 172        memset(ndm, 0, sizeof(*ndm));
 173
 174        send_eth = send_ip = true;
 175
 176        if (type == RTM_GETNEIGH) {
 177                ndm->ndm_family = AF_INET;
 178                send_ip = fdb->remote_ip != 0;
 179                send_eth = !is_zero_ether_addr(fdb->eth_addr);
 180        } else
 181                ndm->ndm_family = AF_BRIDGE;
 182        ndm->ndm_state = fdb->state;
 183        ndm->ndm_ifindex = vxlan->dev->ifindex;
 184        ndm->ndm_flags = NTF_SELF;
 185        ndm->ndm_type = NDA_DST;
 186
 187        if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
 188                goto nla_put_failure;
 189
 190        if (send_ip && nla_put_be32(skb, NDA_DST, fdb->remote_ip))
 191                goto nla_put_failure;
 192
 193        ci.ndm_used      = jiffies_to_clock_t(now - fdb->used);
 194        ci.ndm_confirmed = 0;
 195        ci.ndm_updated   = jiffies_to_clock_t(now - fdb->updated);
 196        ci.ndm_refcnt    = 0;
 197
 198        if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
 199                goto nla_put_failure;
 200
 201        return nlmsg_end(skb, nlh);
 202
 203nla_put_failure:
 204        nlmsg_cancel(skb, nlh);
 205        return -EMSGSIZE;
 206}
 207
 208static inline size_t vxlan_nlmsg_size(void)
 209{
 210        return NLMSG_ALIGN(sizeof(struct ndmsg))
 211                + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
 212                + nla_total_size(sizeof(__be32)) /* NDA_DST */
 213                + nla_total_size(sizeof(struct nda_cacheinfo));
 214}
 215
 216static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
 217                             const struct vxlan_fdb *fdb, int type)
 218{
 219        struct net *net = dev_net(vxlan->dev);
 220        struct sk_buff *skb;
 221        int err = -ENOBUFS;
 222
 223        skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
 224        if (skb == NULL)
 225                goto errout;
 226
 227        err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
 228        if (err < 0) {
 229                /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
 230                WARN_ON(err == -EMSGSIZE);
 231                kfree_skb(skb);
 232                goto errout;
 233        }
 234
 235        rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
 236        return;
 237errout:
 238        if (err < 0)
 239                rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
 240}
 241
 242static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
 243{
 244        struct vxlan_dev *vxlan = netdev_priv(dev);
 245        struct vxlan_fdb f;
 246
 247        memset(&f, 0, sizeof f);
 248        f.state = NUD_STALE;
 249        f.remote_ip = ipa; /* goes to NDA_DST */
 250
 251        vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
 252}
 253
 254static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
 255{
 256        struct vxlan_fdb        f;
 257
 258        memset(&f, 0, sizeof f);
 259        f.state = NUD_STALE;
 260        memcpy(f.eth_addr, eth_addr, ETH_ALEN);
 261
 262        vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
 263}
 264
 265/* Hash Ethernet address */
 266static u32 eth_hash(const unsigned char *addr)
 267{
 268        u64 value = get_unaligned((u64 *)addr);
 269
 270        /* only want 6 bytes */
 271#ifdef __BIG_ENDIAN
 272        value >>= 16;
 273#else
 274        value <<= 16;
 275#endif
 276        return hash_64(value, FDB_HASH_BITS);
 277}
 278
 279/* Hash chain to use given mac address */
 280static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
 281                                                const u8 *mac)
 282{
 283        return &vxlan->fdb_head[eth_hash(mac)];
 284}
 285
 286/* Look up Ethernet address in forwarding table */
 287static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
 288                                        const u8 *mac)
 289
 290{
 291        struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
 292        struct vxlan_fdb *f;
 293        struct hlist_node *node;
 294
 295        hlist_for_each_entry_rcu(f, node, head, hlist) {
 296                if (compare_ether_addr(mac, f->eth_addr) == 0)
 297                        return f;
 298        }
 299
 300        return NULL;
 301}
 302
 303/* Add new entry to forwarding table -- assumes lock held */
 304static int vxlan_fdb_create(struct vxlan_dev *vxlan,
 305                            const u8 *mac, __be32 ip,
 306                            __u16 state, __u16 flags)
 307{
 308        struct vxlan_fdb *f;
 309        int notify = 0;
 310
 311        f = vxlan_find_mac(vxlan, mac);
 312        if (f) {
 313                if (flags & NLM_F_EXCL) {
 314                        netdev_dbg(vxlan->dev,
 315                                   "lost race to create %pM\n", mac);
 316                        return -EEXIST;
 317                }
 318                if (f->state != state) {
 319                        f->state = state;
 320                        f->updated = jiffies;
 321                        notify = 1;
 322                }
 323        } else {
 324                if (!(flags & NLM_F_CREATE))
 325                        return -ENOENT;
 326
 327                if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
 328                        return -ENOSPC;
 329
 330                netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
 331                f = kmalloc(sizeof(*f), GFP_ATOMIC);
 332                if (!f)
 333                        return -ENOMEM;
 334
 335                notify = 1;
 336                f->remote_ip = ip;
 337                f->state = state;
 338                f->updated = f->used = jiffies;
 339                memcpy(f->eth_addr, mac, ETH_ALEN);
 340
 341                ++vxlan->addrcnt;
 342                hlist_add_head_rcu(&f->hlist,
 343                                   vxlan_fdb_head(vxlan, mac));
 344        }
 345
 346        if (notify)
 347                vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
 348
 349        return 0;
 350}
 351
 352static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
 353{
 354        netdev_dbg(vxlan->dev,
 355                    "delete %pM\n", f->eth_addr);
 356
 357        --vxlan->addrcnt;
 358        vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
 359
 360        hlist_del_rcu(&f->hlist);
 361        kfree_rcu(f, rcu);
 362}
 363
 364/* Add static entry (via netlink) */
 365static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 366                         struct net_device *dev,
 367                         const unsigned char *addr, u16 flags)
 368{
 369        struct vxlan_dev *vxlan = netdev_priv(dev);
 370        __be32 ip;
 371        int err;
 372
 373        if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
 374                pr_info("RTM_NEWNEIGH with invalid state %#x\n",
 375                        ndm->ndm_state);
 376                return -EINVAL;
 377        }
 378
 379        if (tb[NDA_DST] == NULL)
 380                return -EINVAL;
 381
 382        if (nla_len(tb[NDA_DST]) != sizeof(__be32))
 383                return -EAFNOSUPPORT;
 384
 385        ip = nla_get_be32(tb[NDA_DST]);
 386
 387        spin_lock_bh(&vxlan->hash_lock);
 388        err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags);
 389        spin_unlock_bh(&vxlan->hash_lock);
 390
 391        return err;
 392}
 393
 394/* Delete entry (via netlink) */
 395static int vxlan_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
 396                            const unsigned char *addr)
 397{
 398        struct vxlan_dev *vxlan = netdev_priv(dev);
 399        struct vxlan_fdb *f;
 400        int err = -ENOENT;
 401
 402        spin_lock_bh(&vxlan->hash_lock);
 403        f = vxlan_find_mac(vxlan, addr);
 404        if (f) {
 405                vxlan_fdb_destroy(vxlan, f);
 406                err = 0;
 407        }
 408        spin_unlock_bh(&vxlan->hash_lock);
 409
 410        return err;
 411}
 412
 413/* Dump forwarding table */
 414static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 415                          struct net_device *dev, int idx)
 416{
 417        struct vxlan_dev *vxlan = netdev_priv(dev);
 418        unsigned int h;
 419
 420        for (h = 0; h < FDB_HASH_SIZE; ++h) {
 421                struct vxlan_fdb *f;
 422                struct hlist_node *n;
 423                int err;
 424
 425                hlist_for_each_entry_rcu(f, n, &vxlan->fdb_head[h], hlist) {
 426                        if (idx < cb->args[0])
 427                                goto skip;
 428
 429                        err = vxlan_fdb_info(skb, vxlan, f,
 430                                             NETLINK_CB(cb->skb).portid,
 431                                             cb->nlh->nlmsg_seq,
 432                                             RTM_NEWNEIGH,
 433                                             NLM_F_MULTI);
 434                        if (err < 0)
 435                                break;
 436skip:
 437                        ++idx;
 438                }
 439        }
 440
 441        return idx;
 442}
 443
 444/* Watch incoming packets to learn mapping between Ethernet address
 445 * and Tunnel endpoint.
 446 */
 447static void vxlan_snoop(struct net_device *dev,
 448                        __be32 src_ip, const u8 *src_mac)
 449{
 450        struct vxlan_dev *vxlan = netdev_priv(dev);
 451        struct vxlan_fdb *f;
 452        int err;
 453
 454        f = vxlan_find_mac(vxlan, src_mac);
 455        if (likely(f)) {
 456                f->used = jiffies;
 457                if (likely(f->remote_ip == src_ip))
 458                        return;
 459
 460                if (net_ratelimit())
 461                        netdev_info(dev,
 462                                    "%pM migrated from %pI4 to %pI4\n",
 463                                    src_mac, &f->remote_ip, &src_ip);
 464
 465                f->remote_ip = src_ip;
 466                f->updated = jiffies;
 467        } else {
 468                /* learned new entry */
 469                spin_lock(&vxlan->hash_lock);
 470                err = vxlan_fdb_create(vxlan, src_mac, src_ip,
 471                                       NUD_REACHABLE,
 472                                       NLM_F_EXCL|NLM_F_CREATE);
 473                spin_unlock(&vxlan->hash_lock);
 474        }
 475}
 476
 477
 478/* See if multicast group is already in use by other ID */
 479static bool vxlan_group_used(struct vxlan_net *vn,
 480                             const struct vxlan_dev *this)
 481{
 482        const struct vxlan_dev *vxlan;
 483        struct hlist_node *node;
 484        unsigned h;
 485
 486        for (h = 0; h < VNI_HASH_SIZE; ++h)
 487                hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {
 488                        if (vxlan == this)
 489                                continue;
 490
 491                        if (!netif_running(vxlan->dev))
 492                                continue;
 493
 494                        if (vxlan->gaddr == this->gaddr)
 495                                return true;
 496                }
 497
 498        return false;
 499}
 500
 501/* kernel equivalent to IP_ADD_MEMBERSHIP */
 502static int vxlan_join_group(struct net_device *dev)
 503{
 504        struct vxlan_dev *vxlan = netdev_priv(dev);
 505        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 506        struct sock *sk = vn->sock->sk;
 507        struct ip_mreqn mreq = {
 508                .imr_multiaddr.s_addr   = vxlan->gaddr,
 509                .imr_ifindex            = vxlan->link,
 510        };
 511        int err;
 512
 513        /* Already a member of group */
 514        if (vxlan_group_used(vn, vxlan))
 515                return 0;
 516
 517        /* Need to drop RTNL to call multicast join */
 518        rtnl_unlock();
 519        lock_sock(sk);
 520        err = ip_mc_join_group(sk, &mreq);
 521        release_sock(sk);
 522        rtnl_lock();
 523
 524        return err;
 525}
 526
 527
 528/* kernel equivalent to IP_DROP_MEMBERSHIP */
 529static int vxlan_leave_group(struct net_device *dev)
 530{
 531        struct vxlan_dev *vxlan = netdev_priv(dev);
 532        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 533        int err = 0;
 534        struct sock *sk = vn->sock->sk;
 535        struct ip_mreqn mreq = {
 536                .imr_multiaddr.s_addr   = vxlan->gaddr,
 537                .imr_ifindex            = vxlan->link,
 538        };
 539
 540        /* Only leave group when last vxlan is done. */
 541        if (vxlan_group_used(vn, vxlan))
 542                return 0;
 543
 544        /* Need to drop RTNL to call multicast leave */
 545        rtnl_unlock();
 546        lock_sock(sk);
 547        err = ip_mc_leave_group(sk, &mreq);
 548        release_sock(sk);
 549        rtnl_lock();
 550
 551        return err;
 552}
 553
/* Callback from net/ipv4/udp.c to receive packets
 *
 * Strips the outer UDP and VXLAN headers, finds the VXLAN device by VNI
 * and hands the inner Ethernet frame to the stack via netif_rx().
 *
 * Returns 0 when the skb has been consumed (delivered or dropped).
 * Returns 1, with the UDP header pushed back, when the packet does not
 * carry a valid VXLAN header.
 * NOTE(review): a positive return presumably tells the UDP layer to
 * process the datagram itself -- confirm against the encap_rcv contract.
 */
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
        struct iphdr *oip;
        struct vxlanhdr *vxh;
        struct vxlan_dev *vxlan;
        struct vxlan_stats *stats;
        __u32 vni;
        int err;

        /* pop off outer UDP header */
        __skb_pull(skb, sizeof(struct udphdr));

        /* Need Vxlan and inner Ethernet header to be present */
        if (!pskb_may_pull(skb, sizeof(struct vxlanhdr)))
                goto error;

        /* Drop packets with reserved bits set */
        vxh = (struct vxlanhdr *) skb->data;
        if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
            (vxh->vx_vni & htonl(0xff))) {
                netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
                           ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
                goto error;
        }

        __skb_pull(skb, sizeof(struct vxlanhdr));

        /* Is this VNI defined? */
        /* The VNI sits in the upper 24 bits of the vx_vni word */
        vni = ntohl(vxh->vx_vni) >> 8;
        vxlan = vxlan_find_vni(sock_net(sk), vni);
        if (!vxlan) {
                netdev_dbg(skb->dev, "unknown vni %d\n", vni);
                goto drop;
        }

        /* The inner frame must hold at least a full Ethernet header */
        if (!pskb_may_pull(skb, ETH_HLEN)) {
                vxlan->dev->stats.rx_length_errors++;
                vxlan->dev->stats.rx_errors++;
                goto drop;
        }

        skb_reset_mac_header(skb);

        /* Re-examine inner Ethernet packet */
        /* Remember the outer IP header before the network header is reset
         * below; it is needed for learning and ECN decapsulation.
         */
        oip = ip_hdr(skb);
        skb->protocol = eth_type_trans(skb, vxlan->dev);

        /* Ignore packet loops (and multicast echo) */
        if (compare_ether_addr(eth_hdr(skb)->h_source,
                               vxlan->dev->dev_addr) == 0)
                goto drop;

        /* Learn: inner source MAC is reachable via the outer source IP */
        if (vxlan->flags & VXLAN_F_LEARN)
                vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);

        __skb_tunnel_rx(skb, vxlan->dev);
        skb_reset_network_header(skb);

        /* If the NIC driver gave us an encapsulated packet with
         * CHECKSUM_UNNECESSARY and Rx checksum feature is enabled,
         * leave the CHECKSUM_UNNECESSARY, the device checksummed it
         * for us. Otherwise force the upper layers to verify it.
         */
        if (skb->ip_summed != CHECKSUM_UNNECESSARY || !skb->encapsulation ||
            !(vxlan->dev->features & NETIF_F_RXCSUM))
                skb->ip_summed = CHECKSUM_NONE;

        skb->encapsulation = 0;

        /* Any non-zero result is optionally logged; only results above 1
         * cause the packet to be dropped and counted as a frame error.
         */
        err = IP_ECN_decapsulate(oip, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                             &oip->saddr, oip->tos);
                if (err > 1) {
                        ++vxlan->dev->stats.rx_frame_errors;
                        ++vxlan->dev->stats.rx_errors;
                        goto drop;
                }
        }

        /* Account the packet in this CPU's counters */
        stats = this_cpu_ptr(vxlan->stats);
        u64_stats_update_begin(&stats->syncp);
        stats->rx_packets++;
        stats->rx_bytes += skb->len;
        u64_stats_update_end(&stats->syncp);

        netif_rx(skb);

        return 0;
error:
        /* Put UDP header back */
        __skb_push(skb, sizeof(struct udphdr));

        return 1;
drop:
        /* Consume bad packet */
        kfree_skb(skb);
        return 0;
}
 655
 656static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 657{
 658        struct vxlan_dev *vxlan = netdev_priv(dev);
 659        struct arphdr *parp;
 660        u8 *arpptr, *sha;
 661        __be32 sip, tip;
 662        struct neighbour *n;
 663
 664        if (dev->flags & IFF_NOARP)
 665                goto out;
 666
 667        if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
 668                dev->stats.tx_dropped++;
 669                goto out;
 670        }
 671        parp = arp_hdr(skb);
 672
 673        if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
 674             parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
 675            parp->ar_pro != htons(ETH_P_IP) ||
 676            parp->ar_op != htons(ARPOP_REQUEST) ||
 677            parp->ar_hln != dev->addr_len ||
 678            parp->ar_pln != 4)
 679                goto out;
 680        arpptr = (u8 *)parp + sizeof(struct arphdr);
 681        sha = arpptr;
 682        arpptr += dev->addr_len;        /* sha */
 683        memcpy(&sip, arpptr, sizeof(sip));
 684        arpptr += sizeof(sip);
 685        arpptr += dev->addr_len;        /* tha */
 686        memcpy(&tip, arpptr, sizeof(tip));
 687
 688        if (ipv4_is_loopback(tip) ||
 689            ipv4_is_multicast(tip))
 690                goto out;
 691
 692        n = neigh_lookup(&arp_tbl, &tip, dev);
 693
 694        if (n) {
 695                struct vxlan_dev *vxlan = netdev_priv(dev);
 696                struct vxlan_fdb *f;
 697                struct sk_buff  *reply;
 698
 699                if (!(n->nud_state & NUD_CONNECTED)) {
 700                        neigh_release(n);
 701                        goto out;
 702                }
 703
 704                f = vxlan_find_mac(vxlan, n->ha);
 705                if (f && f->remote_ip == 0) {
 706                        /* bridge-local neighbor */
 707                        neigh_release(n);
 708                        goto out;
 709                }
 710
 711                reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
 712                                n->ha, sha);
 713
 714                neigh_release(n);
 715
 716                skb_reset_mac_header(reply);
 717                __skb_pull(reply, skb_network_offset(reply));
 718                reply->ip_summed = CHECKSUM_UNNECESSARY;
 719                reply->pkt_type = PACKET_HOST;
 720
 721                if (netif_rx_ni(reply) == NET_RX_DROP)
 722                        dev->stats.rx_dropped++;
 723        } else if (vxlan->flags & VXLAN_F_L3MISS)
 724                vxlan_ip_miss(dev, tip);
 725out:
 726        consume_skb(skb);
 727        return NETDEV_TX_OK;
 728}
 729
 730static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 731{
 732        struct vxlan_dev *vxlan = netdev_priv(dev);
 733        struct neighbour *n;
 734        struct iphdr *pip;
 735
 736        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
 737                return false;
 738
 739        n = NULL;
 740        switch (ntohs(eth_hdr(skb)->h_proto)) {
 741        case ETH_P_IP:
 742                if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 743                        return false;
 744                pip = ip_hdr(skb);
 745                n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
 746                break;
 747        default:
 748                return false;
 749        }
 750
 751        if (n) {
 752                bool diff;
 753
 754                diff = compare_ether_addr(eth_hdr(skb)->h_dest, n->ha) != 0;
 755                if (diff) {
 756                        memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
 757                                dev->addr_len);
 758                        memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
 759                }
 760                neigh_release(n);
 761                return diff;
 762        } else if (vxlan->flags & VXLAN_F_L3MISS)
 763                vxlan_ip_miss(dev, pip->daddr);
 764        return false;
 765}
 766
 767/* Extract dsfield from inner protocol */
 768static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
 769                                   const struct sk_buff *skb)
 770{
 771        if (skb->protocol == htons(ETH_P_IP))
 772                return iph->tos;
 773        else if (skb->protocol == htons(ETH_P_IPV6))
 774                return ipv6_get_dsfield((const struct ipv6hdr *)iph);
 775        else
 776                return 0;
 777}
 778
 779/* Propogate ECN bits out */
 780static inline u8 vxlan_ecn_encap(u8 tos,
 781                                 const struct iphdr *iph,
 782                                 const struct sk_buff *skb)
 783{
 784        u8 inner = vxlan_get_dsfield(iph, skb);
 785
 786        return INET_ECN_encapsulate(tos, inner);
 787}
 788
 789static void vxlan_sock_free(struct sk_buff *skb)
 790{
 791        sock_put(skb->sk);
 792}
 793
 794/* On transmit, associate with the tunnel socket */
 795static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
 796{
 797        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 798        struct sock *sk = vn->sock->sk;
 799
 800        skb_orphan(skb);
 801        sock_hold(sk);
 802        skb->sk = sk;
 803        skb->destructor = vxlan_sock_free;
 804}
 805
 806/* Compute source port for outgoing packet
 807 *   first choice to use L4 flow hash since it will spread
 808 *     better and maybe available from hardware
 809 *   secondary choice is to use jhash on the Ethernet header
 810 */
 811static u16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb)
 812{
 813        unsigned int range = (vxlan->port_max - vxlan->port_min) + 1;
 814        u32 hash;
 815
 816        hash = skb_get_rxhash(skb);
 817        if (!hash)
 818                hash = jhash(skb->data, 2 * ETH_ALEN,
 819                             (__force u32) skb->protocol);
 820
 821        return (((u64) hash * range) >> 32) + vxlan->port_min;
 822}
 823
 824/* Transmit local packets over Vxlan
 825 *
 826 * Outer IP header inherits ECN and DF from inner header.
 827 * Outer UDP destination is the VXLAN assigned port.
 828 *           source port is based on hash of flow
 829 */
 830static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 831{
 832        struct vxlan_dev *vxlan = netdev_priv(dev);
 833        struct rtable *rt;
 834        const struct iphdr *old_iph;
 835        struct ethhdr *eth;
 836        struct iphdr *iph;
 837        struct vxlanhdr *vxh;
 838        struct udphdr *uh;
 839        struct flowi4 fl4;
 840        unsigned int pkt_len = skb->len;
 841        __be32 dst;
 842        __u16 src_port;
 843        __be16 df = 0;
 844        __u8 tos, ttl;
 845        int err;
 846        bool did_rsc = false;
 847        const struct vxlan_fdb *f;
 848
 849        skb_reset_mac_header(skb);
 850        eth = eth_hdr(skb);
 851
 852        if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP)
 853                return arp_reduce(dev, skb);
 854        else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP)
 855                did_rsc = route_shortcircuit(dev, skb);
 856
 857        f = vxlan_find_mac(vxlan, eth->h_dest);
 858        if (f == NULL) {
 859                did_rsc = false;
 860                dst = vxlan->gaddr;
 861                if (!dst && (vxlan->flags & VXLAN_F_L2MISS) &&
 862                    !is_multicast_ether_addr(eth->h_dest))
 863                        vxlan_fdb_miss(vxlan, eth->h_dest);
 864        } else
 865                dst = f->remote_ip;
 866
 867        if (!dst) {
 868                if (did_rsc) {
 869                        __skb_pull(skb, skb_network_offset(skb));
 870                        skb->ip_summed = CHECKSUM_NONE;
 871                        skb->pkt_type = PACKET_HOST;
 872
 873                        /* short-circuited back to local bridge */
 874                        if (netif_rx(skb) == NET_RX_SUCCESS) {
 875                                struct vxlan_stats *stats =
 876                                                this_cpu_ptr(vxlan->stats);
 877
 878                                u64_stats_update_begin(&stats->syncp);
 879                                stats->tx_packets++;
 880                                stats->tx_bytes += pkt_len;
 881                                u64_stats_update_end(&stats->syncp);
 882                        } else {
 883                                dev->stats.tx_errors++;
 884                                dev->stats.tx_aborted_errors++;
 885                        }
 886                        return NETDEV_TX_OK;
 887                }
 888                goto drop;
 889        }
 890
 891        if (!skb->encapsulation) {
 892                skb_reset_inner_headers(skb);
 893                skb->encapsulation = 1;
 894        }
 895
 896        /* Need space for new headers (invalidates iph ptr) */
 897        if (skb_cow_head(skb, VXLAN_HEADROOM))
 898                goto drop;
 899
 900        old_iph = ip_hdr(skb);
 901
 902        ttl = vxlan->ttl;
 903        if (!ttl && IN_MULTICAST(ntohl(dst)))
 904                ttl = 1;
 905
 906        tos = vxlan->tos;
 907        if (tos == 1)
 908                tos = vxlan_get_dsfield(old_iph, skb);
 909
 910        src_port = vxlan_src_port(vxlan, skb);
 911
 912        memset(&fl4, 0, sizeof(fl4));
 913        fl4.flowi4_oif = vxlan->link;
 914        fl4.flowi4_tos = RT_TOS(tos);
 915        fl4.daddr = dst;
 916        fl4.saddr = vxlan->saddr;
 917
 918        rt = ip_route_output_key(dev_net(dev), &fl4);
 919        if (IS_ERR(rt)) {
 920                netdev_dbg(dev, "no route to %pI4\n", &dst);
 921                dev->stats.tx_carrier_errors++;
 922                goto tx_error;
 923        }
 924
 925        if (rt->dst.dev == dev) {
 926                netdev_dbg(dev, "circular route to %pI4\n", &dst);
 927                ip_rt_put(rt);
 928                dev->stats.collisions++;
 929                goto tx_error;
 930        }
 931
 932        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 933        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 934                              IPSKB_REROUTED);
 935        skb_dst_drop(skb);
 936        skb_dst_set(skb, &rt->dst);
 937
 938        vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
 939        vxh->vx_flags = htonl(VXLAN_FLAGS);
 940        vxh->vx_vni = htonl(vxlan->vni << 8);
 941
 942        __skb_push(skb, sizeof(*uh));
 943        skb_reset_transport_header(skb);
 944        uh = udp_hdr(skb);
 945
 946        uh->dest = htons(vxlan_port);
 947        uh->source = htons(src_port);
 948
 949        uh->len = htons(skb->len);
 950        uh->check = 0;
 951
 952        __skb_push(skb, sizeof(*iph));
 953        skb_reset_network_header(skb);
 954        iph             = ip_hdr(skb);
 955        iph->version    = 4;
 956        iph->ihl        = sizeof(struct iphdr) >> 2;
 957        iph->frag_off   = df;
 958        iph->protocol   = IPPROTO_UDP;
 959        iph->tos        = vxlan_ecn_encap(tos, old_iph, skb);
 960        iph->daddr      = dst;
 961        iph->saddr      = fl4.saddr;
 962        iph->ttl        = ttl ? : ip4_dst_hoplimit(&rt->dst);
 963
 964        vxlan_set_owner(dev, skb);
 965
 966        /* See iptunnel_xmit() */
 967        if (skb->ip_summed != CHECKSUM_PARTIAL)
 968                skb->ip_summed = CHECKSUM_NONE;
 969        ip_select_ident(iph, &rt->dst, NULL);
 970
 971        err = ip_local_out(skb);
 972        if (likely(net_xmit_eval(err) == 0)) {
 973                struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
 974
 975                u64_stats_update_begin(&stats->syncp);
 976                stats->tx_packets++;
 977                stats->tx_bytes += pkt_len;
 978                u64_stats_update_end(&stats->syncp);
 979        } else {
 980                dev->stats.tx_errors++;
 981                dev->stats.tx_aborted_errors++;
 982        }
 983        return NETDEV_TX_OK;
 984
 985drop:
 986        dev->stats.tx_dropped++;
 987        goto tx_free;
 988
 989tx_error:
 990        dev->stats.tx_errors++;
 991tx_free:
 992        dev_kfree_skb(skb);
 993        return NETDEV_TX_OK;
 994}
 995
 996/* Walk the forwarding table and purge stale entries */
 997static void vxlan_cleanup(unsigned long arg)
 998{
 999        struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
1000        unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
1001        unsigned int h;
1002
1003        if (!netif_running(vxlan->dev))
1004                return;
1005
1006        spin_lock_bh(&vxlan->hash_lock);
1007        for (h = 0; h < FDB_HASH_SIZE; ++h) {
1008                struct hlist_node *p, *n;
1009                hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
1010                        struct vxlan_fdb *f
1011                                = container_of(p, struct vxlan_fdb, hlist);
1012                        unsigned long timeout;
1013
1014                        if (f->state & NUD_PERMANENT)
1015                                continue;
1016
1017                        timeout = f->used + vxlan->age_interval * HZ;
1018                        if (time_before_eq(timeout, jiffies)) {
1019                                netdev_dbg(vxlan->dev,
1020                                           "garbage collect %pM\n",
1021                                           f->eth_addr);
1022                                f->state = NUD_STALE;
1023                                vxlan_fdb_destroy(vxlan, f);
1024                        } else if (time_before(timeout, next_timer))
1025                                next_timer = timeout;
1026                }
1027        }
1028        spin_unlock_bh(&vxlan->hash_lock);
1029
1030        mod_timer(&vxlan->age_timer, next_timer);
1031}
1032
1033/* Setup stats when device is created */
1034static int vxlan_init(struct net_device *dev)
1035{
1036        struct vxlan_dev *vxlan = netdev_priv(dev);
1037
1038        vxlan->stats = alloc_percpu(struct vxlan_stats);
1039        if (!vxlan->stats)
1040                return -ENOMEM;
1041
1042        return 0;
1043}
1044
1045/* Start ageing timer and join group when device is brought up */
1046static int vxlan_open(struct net_device *dev)
1047{
1048        struct vxlan_dev *vxlan = netdev_priv(dev);
1049        int err;
1050
1051        if (vxlan->gaddr) {
1052                err = vxlan_join_group(dev);
1053                if (err)
1054                        return err;
1055        }
1056
1057        if (vxlan->age_interval)
1058                mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
1059
1060        return 0;
1061}
1062
1063/* Purge the forwarding table */
1064static void vxlan_flush(struct vxlan_dev *vxlan)
1065{
1066        unsigned h;
1067
1068        spin_lock_bh(&vxlan->hash_lock);
1069        for (h = 0; h < FDB_HASH_SIZE; ++h) {
1070                struct hlist_node *p, *n;
1071                hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
1072                        struct vxlan_fdb *f
1073                                = container_of(p, struct vxlan_fdb, hlist);
1074                        vxlan_fdb_destroy(vxlan, f);
1075                }
1076        }
1077        spin_unlock_bh(&vxlan->hash_lock);
1078}
1079
1080/* Cleanup timer and forwarding table on shutdown */
1081static int vxlan_stop(struct net_device *dev)
1082{
1083        struct vxlan_dev *vxlan = netdev_priv(dev);
1084
1085        if (vxlan->gaddr)
1086                vxlan_leave_group(dev);
1087
1088        del_timer_sync(&vxlan->age_timer);
1089
1090        vxlan_flush(vxlan);
1091
1092        return 0;
1093}
1094
1095/* Merge per-cpu statistics */
1096static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
1097                                               struct rtnl_link_stats64 *stats)
1098{
1099        struct vxlan_dev *vxlan = netdev_priv(dev);
1100        struct vxlan_stats tmp, sum = { 0 };
1101        unsigned int cpu;
1102
1103        for_each_possible_cpu(cpu) {
1104                unsigned int start;
1105                const struct vxlan_stats *stats
1106                        = per_cpu_ptr(vxlan->stats, cpu);
1107
1108                do {
1109                        start = u64_stats_fetch_begin_bh(&stats->syncp);
1110                        memcpy(&tmp, stats, sizeof(tmp));
1111                } while (u64_stats_fetch_retry_bh(&stats->syncp, start));
1112
1113                sum.tx_bytes   += tmp.tx_bytes;
1114                sum.tx_packets += tmp.tx_packets;
1115                sum.rx_bytes   += tmp.rx_bytes;
1116                sum.rx_packets += tmp.rx_packets;
1117        }
1118
1119        stats->tx_bytes   = sum.tx_bytes;
1120        stats->tx_packets = sum.tx_packets;
1121        stats->rx_bytes   = sum.rx_bytes;
1122        stats->rx_packets = sum.rx_packets;
1123
1124        stats->multicast = dev->stats.multicast;
1125        stats->rx_length_errors = dev->stats.rx_length_errors;
1126        stats->rx_frame_errors = dev->stats.rx_frame_errors;
1127        stats->rx_errors = dev->stats.rx_errors;
1128
1129        stats->tx_dropped = dev->stats.tx_dropped;
1130        stats->tx_carrier_errors  = dev->stats.tx_carrier_errors;
1131        stats->tx_aborted_errors  = dev->stats.tx_aborted_errors;
1132        stats->collisions  = dev->stats.collisions;
1133        stats->tx_errors = dev->stats.tx_errors;
1134
1135        return stats;
1136}
1137
/*
 * ndo_set_rx_mode hook.  Intentionally empty: this driver has no
 * receive-mode state to update.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
1142
1143static const struct net_device_ops vxlan_netdev_ops = {
1144        .ndo_init               = vxlan_init,
1145        .ndo_open               = vxlan_open,
1146        .ndo_stop               = vxlan_stop,
1147        .ndo_start_xmit         = vxlan_xmit,
1148        .ndo_get_stats64        = vxlan_stats64,
1149        .ndo_set_rx_mode        = vxlan_set_multicast_list,
1150        .ndo_change_mtu         = eth_change_mtu,
1151        .ndo_validate_addr      = eth_validate_addr,
1152        .ndo_set_mac_address    = eth_mac_addr,
1153        .ndo_fdb_add            = vxlan_fdb_add,
1154        .ndo_fdb_del            = vxlan_fdb_delete,
1155        .ndo_fdb_dump           = vxlan_fdb_dump,
1156};
1157
1158/* Info for udev, that this is a virtual tunnel endpoint */
1159static struct device_type vxlan_type = {
1160        .name = "vxlan",
1161};
1162
1163static void vxlan_free(struct net_device *dev)
1164{
1165        struct vxlan_dev *vxlan = netdev_priv(dev);
1166
1167        free_percpu(vxlan->stats);
1168        free_netdev(dev);
1169}
1170
1171/* Initialize the device structure. */
1172static void vxlan_setup(struct net_device *dev)
1173{
1174        struct vxlan_dev *vxlan = netdev_priv(dev);
1175        unsigned h;
1176        int low, high;
1177
1178        eth_hw_addr_random(dev);
1179        ether_setup(dev);
1180        dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;
1181
1182        dev->netdev_ops = &vxlan_netdev_ops;
1183        dev->destructor = vxlan_free;
1184        SET_NETDEV_DEVTYPE(dev, &vxlan_type);
1185
1186        dev->tx_queue_len = 0;
1187        dev->features   |= NETIF_F_LLTX;
1188        dev->features   |= NETIF_F_NETNS_LOCAL;
1189        dev->features   |= NETIF_F_SG | NETIF_F_HW_CSUM;
1190        dev->features   |= NETIF_F_RXCSUM;
1191
1192        dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
1193        dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1194        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1195
1196        spin_lock_init(&vxlan->hash_lock);
1197
1198        init_timer_deferrable(&vxlan->age_timer);
1199        vxlan->age_timer.function = vxlan_cleanup;
1200        vxlan->age_timer.data = (unsigned long) vxlan;
1201
1202        inet_get_local_port_range(&low, &high);
1203        vxlan->port_min = low;
1204        vxlan->port_max = high;
1205
1206        vxlan->dev = dev;
1207
1208        for (h = 0; h < FDB_HASH_SIZE; ++h)
1209                INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
1210}
1211
1212static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
1213        [IFLA_VXLAN_ID]         = { .type = NLA_U32 },
1214        [IFLA_VXLAN_GROUP]      = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1215        [IFLA_VXLAN_LINK]       = { .type = NLA_U32 },
1216        [IFLA_VXLAN_LOCAL]      = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1217        [IFLA_VXLAN_TOS]        = { .type = NLA_U8 },
1218        [IFLA_VXLAN_TTL]        = { .type = NLA_U8 },
1219        [IFLA_VXLAN_LEARNING]   = { .type = NLA_U8 },
1220        [IFLA_VXLAN_AGEING]     = { .type = NLA_U32 },
1221        [IFLA_VXLAN_LIMIT]      = { .type = NLA_U32 },
1222        [IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
1223        [IFLA_VXLAN_PROXY]      = { .type = NLA_U8 },
1224        [IFLA_VXLAN_RSC]        = { .type = NLA_U8 },
1225        [IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
1226        [IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
1227};
1228
1229static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
1230{
1231        if (tb[IFLA_ADDRESS]) {
1232                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
1233                        pr_debug("invalid link address (not ethernet)\n");
1234                        return -EINVAL;
1235                }
1236
1237                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
1238                        pr_debug("invalid all zero ethernet address\n");
1239                        return -EADDRNOTAVAIL;
1240                }
1241        }
1242
1243        if (!data)
1244                return -EINVAL;
1245
1246        if (data[IFLA_VXLAN_ID]) {
1247                __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
1248                if (id >= VXLAN_VID_MASK)
1249                        return -ERANGE;
1250        }
1251
1252        if (data[IFLA_VXLAN_GROUP]) {
1253                __be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
1254                if (!IN_MULTICAST(ntohl(gaddr))) {
1255                        pr_debug("group address is not IPv4 multicast\n");
1256                        return -EADDRNOTAVAIL;
1257                }
1258        }
1259
1260        if (data[IFLA_VXLAN_PORT_RANGE]) {
1261                const struct ifla_vxlan_port_range *p
1262                        = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
1263
1264                if (ntohs(p->high) < ntohs(p->low)) {
1265                        pr_debug("port range %u .. %u not valid\n",
1266                                 ntohs(p->low), ntohs(p->high));
1267                        return -EINVAL;
1268                }
1269        }
1270
1271        return 0;
1272}
1273
1274static int vxlan_newlink(struct net *net, struct net_device *dev,
1275                         struct nlattr *tb[], struct nlattr *data[])
1276{
1277        struct vxlan_dev *vxlan = netdev_priv(dev);
1278        __u32 vni;
1279        int err;
1280
1281        if (!data[IFLA_VXLAN_ID])
1282                return -EINVAL;
1283
1284        vni = nla_get_u32(data[IFLA_VXLAN_ID]);
1285        if (vxlan_find_vni(net, vni)) {
1286                pr_info("duplicate VNI %u\n", vni);
1287                return -EEXIST;
1288        }
1289        vxlan->vni = vni;
1290
1291        if (data[IFLA_VXLAN_GROUP])
1292                vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
1293
1294        if (data[IFLA_VXLAN_LOCAL])
1295                vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
1296
1297        if (data[IFLA_VXLAN_LINK] &&
1298            (vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]))) {
1299                struct net_device *lowerdev
1300                         = __dev_get_by_index(net, vxlan->link);
1301
1302                if (!lowerdev) {
1303                        pr_info("ifindex %d does not exist\n", vxlan->link);
1304                        return -ENODEV;
1305                }
1306
1307                if (!tb[IFLA_MTU])
1308                        dev->mtu = lowerdev->mtu - VXLAN_HEADROOM;
1309
1310                /* update header length based on lower device */
1311                dev->hard_header_len = lowerdev->hard_header_len +
1312                                       VXLAN_HEADROOM;
1313        }
1314
1315        if (data[IFLA_VXLAN_TOS])
1316                vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
1317
1318        if (data[IFLA_VXLAN_TTL])
1319                vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
1320
1321        if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
1322                vxlan->flags |= VXLAN_F_LEARN;
1323
1324        if (data[IFLA_VXLAN_AGEING])
1325                vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
1326        else
1327                vxlan->age_interval = FDB_AGE_DEFAULT;
1328
1329        if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
1330                vxlan->flags |= VXLAN_F_PROXY;
1331
1332        if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
1333                vxlan->flags |= VXLAN_F_RSC;
1334
1335        if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
1336                vxlan->flags |= VXLAN_F_L2MISS;
1337
1338        if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
1339                vxlan->flags |= VXLAN_F_L3MISS;
1340
1341        if (data[IFLA_VXLAN_LIMIT])
1342                vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
1343
1344        if (data[IFLA_VXLAN_PORT_RANGE]) {
1345                const struct ifla_vxlan_port_range *p
1346                        = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
1347                vxlan->port_min = ntohs(p->low);
1348                vxlan->port_max = ntohs(p->high);
1349        }
1350
1351        err = register_netdevice(dev);
1352        if (!err)
1353                hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));
1354
1355        return err;
1356}
1357
1358static void vxlan_dellink(struct net_device *dev, struct list_head *head)
1359{
1360        struct vxlan_dev *vxlan = netdev_priv(dev);
1361
1362        hlist_del_rcu(&vxlan->hlist);
1363
1364        unregister_netdevice_queue(dev, head);
1365}
1366
1367static size_t vxlan_get_size(const struct net_device *dev)
1368{
1369
1370        return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
1371                nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
1372                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
1373                nla_total_size(sizeof(__be32))+ /* IFLA_VXLAN_LOCAL */
1374                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
1375                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
1376                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
1377                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
1378                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
1379                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
1380                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
1381                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
1382                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
1383                nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
1384                0;
1385}
1386
1387static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
1388{
1389        const struct vxlan_dev *vxlan = netdev_priv(dev);
1390        struct ifla_vxlan_port_range ports = {
1391                .low =  htons(vxlan->port_min),
1392                .high = htons(vxlan->port_max),
1393        };
1394
1395        if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
1396                goto nla_put_failure;
1397
1398        if (vxlan->gaddr && nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
1399                goto nla_put_failure;
1400
1401        if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
1402                goto nla_put_failure;
1403
1404        if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
1405                goto nla_put_failure;
1406
1407        if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
1408            nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
1409            nla_put_u8(skb, IFLA_VXLAN_LEARNING,
1410                        !!(vxlan->flags & VXLAN_F_LEARN)) ||
1411            nla_put_u8(skb, IFLA_VXLAN_PROXY,
1412                        !!(vxlan->flags & VXLAN_F_PROXY)) ||
1413            nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
1414            nla_put_u8(skb, IFLA_VXLAN_L2MISS,
1415                        !!(vxlan->flags & VXLAN_F_L2MISS)) ||
1416            nla_put_u8(skb, IFLA_VXLAN_L3MISS,
1417                        !!(vxlan->flags & VXLAN_F_L3MISS)) ||
1418            nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
1419            nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax))
1420                goto nla_put_failure;
1421
1422        if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
1423                goto nla_put_failure;
1424
1425        return 0;
1426
1427nla_put_failure:
1428        return -EMSGSIZE;
1429}
1430
1431static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
1432        .kind           = "vxlan",
1433        .maxtype        = IFLA_VXLAN_MAX,
1434        .policy         = vxlan_policy,
1435        .priv_size      = sizeof(struct vxlan_dev),
1436        .setup          = vxlan_setup,
1437        .validate       = vxlan_validate,
1438        .newlink        = vxlan_newlink,
1439        .dellink        = vxlan_dellink,
1440        .get_size       = vxlan_get_size,
1441        .fill_info      = vxlan_fill_info,
1442};
1443
1444static __net_init int vxlan_init_net(struct net *net)
1445{
1446        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1447        struct sock *sk;
1448        struct sockaddr_in vxlan_addr = {
1449                .sin_family = AF_INET,
1450                .sin_addr.s_addr = htonl(INADDR_ANY),
1451        };
1452        int rc;
1453        unsigned h;
1454
1455        /* Create UDP socket for encapsulation receive. */
1456        rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
1457        if (rc < 0) {
1458                pr_debug("UDP socket create failed\n");
1459                return rc;
1460        }
1461        /* Put in proper namespace */
1462        sk = vn->sock->sk;
1463        sk_change_net(sk, net);
1464
1465        vxlan_addr.sin_port = htons(vxlan_port);
1466
1467        rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr,
1468                         sizeof(vxlan_addr));
1469        if (rc < 0) {
1470                pr_debug("bind for UDP socket %pI4:%u (%d)\n",
1471                         &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
1472                sk_release_kernel(sk);
1473                vn->sock = NULL;
1474                return rc;
1475        }
1476
1477        /* Disable multicast loopback */
1478        inet_sk(sk)->mc_loop = 0;
1479
1480        /* Mark socket as an encapsulation socket. */
1481        udp_sk(sk)->encap_type = 1;
1482        udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
1483        udp_encap_enable();
1484
1485        for (h = 0; h < VNI_HASH_SIZE; ++h)
1486                INIT_HLIST_HEAD(&vn->vni_list[h]);
1487
1488        return 0;
1489}
1490
1491static __net_exit void vxlan_exit_net(struct net *net)
1492{
1493        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1494        struct vxlan_dev *vxlan;
1495        struct hlist_node *pos;
1496        unsigned h;
1497
1498        rtnl_lock();
1499        for (h = 0; h < VNI_HASH_SIZE; ++h)
1500                hlist_for_each_entry(vxlan, pos, &vn->vni_list[h], hlist)
1501                        dev_close(vxlan->dev);
1502        rtnl_unlock();
1503
1504        if (vn->sock) {
1505                sk_release_kernel(vn->sock->sk);
1506                vn->sock = NULL;
1507        }
1508}
1509
1510static struct pernet_operations vxlan_net_ops = {
1511        .init = vxlan_init_net,
1512        .exit = vxlan_exit_net,
1513        .id   = &vxlan_net_id,
1514        .size = sizeof(struct vxlan_net),
1515};
1516
1517static int __init vxlan_init_module(void)
1518{
1519        int rc;
1520
1521        get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
1522
1523        rc = register_pernet_device(&vxlan_net_ops);
1524        if (rc)
1525                goto out1;
1526
1527        rc = rtnl_link_register(&vxlan_link_ops);
1528        if (rc)
1529                goto out2;
1530
1531        return 0;
1532
1533out2:
1534        unregister_pernet_device(&vxlan_net_ops);
1535out1:
1536        return rc;
1537}
1538module_init(vxlan_init_module);
1539
1540static void __exit vxlan_cleanup_module(void)
1541{
1542        rtnl_link_unregister(&vxlan_link_ops);
1543        unregister_pernet_device(&vxlan_net_ops);
1544}
1545module_exit(vxlan_cleanup_module);
1546
1547MODULE_LICENSE("GPL");
1548MODULE_VERSION(VXLAN_VERSION);
1549MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>");
1550MODULE_ALIAS_RTNL_LINK("vxlan");
1551
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.