/* linux/drivers/net/vxlan.c */
   1/*
   2 * VXLAN: Virtual eXtensible Local Area Network
   3 *
   4 * Copyright (c) 2012 Vyatta Inc.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 as
   8 * published by the Free Software Foundation.
   9 *
  10 * TODO
  11 *  - use IANA UDP port number (when defined)
  12 *  - IPv6 (not in RFC)
  13 */
  14
/* Prefix applied to this file's pr_*() messages: "<module name>: ". */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  16
  17#include <linux/kernel.h>
  18#include <linux/types.h>
  19#include <linux/module.h>
  20#include <linux/errno.h>
  21#include <linux/slab.h>
  22#include <linux/skbuff.h>
  23#include <linux/rculist.h>
  24#include <linux/netdevice.h>
  25#include <linux/in.h>
  26#include <linux/ip.h>
  27#include <linux/udp.h>
  28#include <linux/igmp.h>
  29#include <linux/etherdevice.h>
  30#include <linux/if_ether.h>
  31#include <linux/hash.h>
  32#include <linux/ethtool.h>
  33#include <net/arp.h>
  34#include <net/ndisc.h>
  35#include <net/ip.h>
  36#include <net/ipip.h>
  37#include <net/icmp.h>
  38#include <net/udp.h>
  39#include <net/rtnetlink.h>
  40#include <net/route.h>
  41#include <net/dsfield.h>
  42#include <net/inet_ecn.h>
  43#include <net/net_namespace.h>
  44#include <net/netns/generic.h>
  45
  46#define VXLAN_VERSION   "0.1"
  47
  48#define VNI_HASH_BITS   10
  49#define VNI_HASH_SIZE   (1<<VNI_HASH_BITS)
  50#define FDB_HASH_BITS   8
  51#define FDB_HASH_SIZE   (1<<FDB_HASH_BITS)
  52#define FDB_AGE_DEFAULT 300 /* 5 min */
  53#define FDB_AGE_INTERVAL (10 * HZ)      /* rescan interval */
  54
  55#define VXLAN_N_VID     (1u << 24)
  56#define VXLAN_VID_MASK  (VXLAN_N_VID - 1)
  57/* IP header + UDP + VXLAN + Ethernet header */
  58#define VXLAN_HEADROOM (20 + 8 + 8 + 14)
  59
  60#define VXLAN_FLAGS 0x08000000  /* struct vxlanhdr.vx_flags required value. */
  61
/* VXLAN protocol header: two big-endian 32-bit words that follow the
 * outer UDP header.
 */
struct vxlanhdr {
        __be32 vx_flags;        /* receive path requires htonl(VXLAN_FLAGS) */
        __be32 vx_vni;          /* VNI in the upper 24 bits; low 8 bits must be 0 */
};
  67
/* UDP port for VXLAN traffic.
 * Exposed as module parameter "udp_port" with mode 0444.
 */
static unsigned int vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, uint, 0444);
MODULE_PARM_DESC(udp_port, "Destination UDP port");

/* When true, the receive path logs (rate-limited) packets for which
 * ECN decapsulation reports an error.  Mode 0644.
 */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
  76
/* per-net private data for this module */
static unsigned int vxlan_net_id;       /* id handed to net_generic() to find struct vxlan_net */
struct vxlan_net {
        struct socket     *sock;        /* UDP encap socket */
        struct hlist_head vni_list[VNI_HASH_SIZE];      /* vxlan_dev chains, bucket chosen by vni_head() */
};
  83
/* Forwarding table entry: maps an inner Ethernet address to the IPv4
 * address of the remote tunnel endpoint.
 */
struct vxlan_fdb {
        struct hlist_node hlist;        /* linked list of entries */
        struct rcu_head   rcu;          /* used by kfree_rcu() on destroy */
        unsigned long     updated;      /* jiffies */
        unsigned long     used;         /* jiffies; refreshed by vxlan_snoop() on a hit */
        __be32            remote_ip;    /* arp_reduce() treats 0 as a bridge-local neighbor */
        u16               state;        /* see ndm_state */
        u8                eth_addr[ETH_ALEN];
};
  94
/* Per-cpu network traffic stats.
 * Counters are updated between u64_stats_update_begin()/end() on syncp.
 */
struct vxlan_stats {
        u64                     rx_packets;
        u64                     rx_bytes;
        u64                     tx_packets;
        u64                     tx_bytes;
        struct u64_stats_sync   syncp;
};
 103
/* Pseudo network device */
struct vxlan_dev {
        struct hlist_node hlist;        /* entry in a vxlan_net.vni_list chain */
        struct net_device *dev;
        struct vxlan_stats __percpu *stats;
        __u32             vni;          /* virtual network id */
        __be32            gaddr;        /* multicast group */
        __be32            saddr;        /* source address */
        unsigned int      link;         /* link to multicast over */
        __u16             port_min;     /* source port range */
        __u16             port_max;
        __u8              tos;          /* TOS override; 1 = take it from the inner header */
        __u8              ttl;          /* 0 = not set; transmit path picks a value */
        u32               flags;        /* VXLAN_F_* below */

        unsigned long     age_interval;
        struct timer_list age_timer;
        spinlock_t        hash_lock;    /* held around fdb create/destroy */
        unsigned int      addrcnt;      /* number of fdb entries */
        unsigned int      addrmax;      /* fdb entry limit; 0 disables the limit */

        struct hlist_head fdb_head[FDB_HASH_SIZE];      /* fdb chains, bucket chosen by eth_hash() */
};

/* Bits for vxlan_dev.flags */
#define VXLAN_F_LEARN   0x01    /* receive path calls vxlan_snoop() */
#define VXLAN_F_PROXY   0x02    /* transmit path hands ARP frames to arp_reduce() */
#define VXLAN_F_RSC     0x04    /* transmit path tries route_shortcircuit() for IPv4 */
#define VXLAN_F_L2MISS  0x08    /* notify on forwarding table miss */
#define VXLAN_F_L3MISS  0x10    /* notify on neighbour lookup miss */
 133
/* salt for hash table
 * NOTE(review): nothing in the visible part of this file reads it;
 * confirm where it is seeded and consumed.
 */
static u32 vxlan_salt __read_mostly;
 136
 137static inline struct hlist_head *vni_head(struct net *net, u32 id)
 138{
 139        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 140
 141        return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
 142}
 143
 144/* Look up VNI in a per net namespace table */
 145static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
 146{
 147        struct vxlan_dev *vxlan;
 148
 149        hlist_for_each_entry_rcu(vxlan, vni_head(net, id), hlist) {
 150                if (vxlan->vni == id)
 151                        return vxlan;
 152        }
 153
 154        return NULL;
 155}
 156
 157/* Fill in neighbour message in skbuff. */
 158static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 159                           const struct vxlan_fdb *fdb,
 160                           u32 portid, u32 seq, int type, unsigned int flags)
 161{
 162        unsigned long now = jiffies;
 163        struct nda_cacheinfo ci;
 164        struct nlmsghdr *nlh;
 165        struct ndmsg *ndm;
 166        bool send_ip, send_eth;
 167
 168        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
 169        if (nlh == NULL)
 170                return -EMSGSIZE;
 171
 172        ndm = nlmsg_data(nlh);
 173        memset(ndm, 0, sizeof(*ndm));
 174
 175        send_eth = send_ip = true;
 176
 177        if (type == RTM_GETNEIGH) {
 178                ndm->ndm_family = AF_INET;
 179                send_ip = fdb->remote_ip != 0;
 180                send_eth = !is_zero_ether_addr(fdb->eth_addr);
 181        } else
 182                ndm->ndm_family = AF_BRIDGE;
 183        ndm->ndm_state = fdb->state;
 184        ndm->ndm_ifindex = vxlan->dev->ifindex;
 185        ndm->ndm_flags = NTF_SELF;
 186        ndm->ndm_type = NDA_DST;
 187
 188        if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
 189                goto nla_put_failure;
 190
 191        if (send_ip && nla_put_be32(skb, NDA_DST, fdb->remote_ip))
 192                goto nla_put_failure;
 193
 194        ci.ndm_used      = jiffies_to_clock_t(now - fdb->used);
 195        ci.ndm_confirmed = 0;
 196        ci.ndm_updated   = jiffies_to_clock_t(now - fdb->updated);
 197        ci.ndm_refcnt    = 0;
 198
 199        if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
 200                goto nla_put_failure;
 201
 202        return nlmsg_end(skb, nlh);
 203
 204nla_put_failure:
 205        nlmsg_cancel(skb, nlh);
 206        return -EMSGSIZE;
 207}
 208
 209static inline size_t vxlan_nlmsg_size(void)
 210{
 211        return NLMSG_ALIGN(sizeof(struct ndmsg))
 212                + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
 213                + nla_total_size(sizeof(__be32)) /* NDA_DST */
 214                + nla_total_size(sizeof(struct nda_cacheinfo));
 215}
 216
 217static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
 218                             const struct vxlan_fdb *fdb, int type)
 219{
 220        struct net *net = dev_net(vxlan->dev);
 221        struct sk_buff *skb;
 222        int err = -ENOBUFS;
 223
 224        skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
 225        if (skb == NULL)
 226                goto errout;
 227
 228        err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
 229        if (err < 0) {
 230                /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
 231                WARN_ON(err == -EMSGSIZE);
 232                kfree_skb(skb);
 233                goto errout;
 234        }
 235
 236        rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
 237        return;
 238errout:
 239        if (err < 0)
 240                rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
 241}
 242
 243static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
 244{
 245        struct vxlan_dev *vxlan = netdev_priv(dev);
 246        struct vxlan_fdb f;
 247
 248        memset(&f, 0, sizeof f);
 249        f.state = NUD_STALE;
 250        f.remote_ip = ipa; /* goes to NDA_DST */
 251
 252        vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
 253}
 254
 255static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
 256{
 257        struct vxlan_fdb        f;
 258
 259        memset(&f, 0, sizeof f);
 260        f.state = NUD_STALE;
 261        memcpy(f.eth_addr, eth_addr, ETH_ALEN);
 262
 263        vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
 264}
 265
 266/* Hash Ethernet address */
 267static u32 eth_hash(const unsigned char *addr)
 268{
 269        u64 value = get_unaligned((u64 *)addr);
 270
 271        /* only want 6 bytes */
 272#ifdef __BIG_ENDIAN
 273        value >>= 16;
 274#else
 275        value <<= 16;
 276#endif
 277        return hash_64(value, FDB_HASH_BITS);
 278}
 279
 280/* Hash chain to use given mac address */
 281static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
 282                                                const u8 *mac)
 283{
 284        return &vxlan->fdb_head[eth_hash(mac)];
 285}
 286
 287/* Look up Ethernet address in forwarding table */
 288static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
 289                                        const u8 *mac)
 290
 291{
 292        struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
 293        struct vxlan_fdb *f;
 294
 295        hlist_for_each_entry_rcu(f, head, hlist) {
 296                if (compare_ether_addr(mac, f->eth_addr) == 0)
 297                        return f;
 298        }
 299
 300        return NULL;
 301}
 302
 303/* Add new entry to forwarding table -- assumes lock held */
 304static int vxlan_fdb_create(struct vxlan_dev *vxlan,
 305                            const u8 *mac, __be32 ip,
 306                            __u16 state, __u16 flags)
 307{
 308        struct vxlan_fdb *f;
 309        int notify = 0;
 310
 311        f = vxlan_find_mac(vxlan, mac);
 312        if (f) {
 313                if (flags & NLM_F_EXCL) {
 314                        netdev_dbg(vxlan->dev,
 315                                   "lost race to create %pM\n", mac);
 316                        return -EEXIST;
 317                }
 318                if (f->state != state) {
 319                        f->state = state;
 320                        f->updated = jiffies;
 321                        notify = 1;
 322                }
 323        } else {
 324                if (!(flags & NLM_F_CREATE))
 325                        return -ENOENT;
 326
 327                if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
 328                        return -ENOSPC;
 329
 330                netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
 331                f = kmalloc(sizeof(*f), GFP_ATOMIC);
 332                if (!f)
 333                        return -ENOMEM;
 334
 335                notify = 1;
 336                f->remote_ip = ip;
 337                f->state = state;
 338                f->updated = f->used = jiffies;
 339                memcpy(f->eth_addr, mac, ETH_ALEN);
 340
 341                ++vxlan->addrcnt;
 342                hlist_add_head_rcu(&f->hlist,
 343                                   vxlan_fdb_head(vxlan, mac));
 344        }
 345
 346        if (notify)
 347                vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
 348
 349        return 0;
 350}
 351
 352static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
 353{
 354        netdev_dbg(vxlan->dev,
 355                    "delete %pM\n", f->eth_addr);
 356
 357        --vxlan->addrcnt;
 358        vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
 359
 360        hlist_del_rcu(&f->hlist);
 361        kfree_rcu(f, rcu);
 362}
 363
 364/* Add static entry (via netlink) */
 365static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 366                         struct net_device *dev,
 367                         const unsigned char *addr, u16 flags)
 368{
 369        struct vxlan_dev *vxlan = netdev_priv(dev);
 370        __be32 ip;
 371        int err;
 372
 373        if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
 374                pr_info("RTM_NEWNEIGH with invalid state %#x\n",
 375                        ndm->ndm_state);
 376                return -EINVAL;
 377        }
 378
 379        if (tb[NDA_DST] == NULL)
 380                return -EINVAL;
 381
 382        if (nla_len(tb[NDA_DST]) != sizeof(__be32))
 383                return -EAFNOSUPPORT;
 384
 385        ip = nla_get_be32(tb[NDA_DST]);
 386
 387        spin_lock_bh(&vxlan->hash_lock);
 388        err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags);
 389        spin_unlock_bh(&vxlan->hash_lock);
 390
 391        return err;
 392}
 393
 394/* Delete entry (via netlink) */
 395static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 396                            struct net_device *dev,
 397                            const unsigned char *addr)
 398{
 399        struct vxlan_dev *vxlan = netdev_priv(dev);
 400        struct vxlan_fdb *f;
 401        int err = -ENOENT;
 402
 403        spin_lock_bh(&vxlan->hash_lock);
 404        f = vxlan_find_mac(vxlan, addr);
 405        if (f) {
 406                vxlan_fdb_destroy(vxlan, f);
 407                err = 0;
 408        }
 409        spin_unlock_bh(&vxlan->hash_lock);
 410
 411        return err;
 412}
 413
 414/* Dump forwarding table */
 415static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 416                          struct net_device *dev, int idx)
 417{
 418        struct vxlan_dev *vxlan = netdev_priv(dev);
 419        unsigned int h;
 420
 421        for (h = 0; h < FDB_HASH_SIZE; ++h) {
 422                struct vxlan_fdb *f;
 423                int err;
 424
 425                hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
 426                        if (idx < cb->args[0])
 427                                goto skip;
 428
 429                        err = vxlan_fdb_info(skb, vxlan, f,
 430                                             NETLINK_CB(cb->skb).portid,
 431                                             cb->nlh->nlmsg_seq,
 432                                             RTM_NEWNEIGH,
 433                                             NLM_F_MULTI);
 434                        if (err < 0)
 435                                break;
 436skip:
 437                        ++idx;
 438                }
 439        }
 440
 441        return idx;
 442}
 443
 444/* Watch incoming packets to learn mapping between Ethernet address
 445 * and Tunnel endpoint.
 446 */
 447static void vxlan_snoop(struct net_device *dev,
 448                        __be32 src_ip, const u8 *src_mac)
 449{
 450        struct vxlan_dev *vxlan = netdev_priv(dev);
 451        struct vxlan_fdb *f;
 452        int err;
 453
 454        f = vxlan_find_mac(vxlan, src_mac);
 455        if (likely(f)) {
 456                f->used = jiffies;
 457                if (likely(f->remote_ip == src_ip))
 458                        return;
 459
 460                if (net_ratelimit())
 461                        netdev_info(dev,
 462                                    "%pM migrated from %pI4 to %pI4\n",
 463                                    src_mac, &f->remote_ip, &src_ip);
 464
 465                f->remote_ip = src_ip;
 466                f->updated = jiffies;
 467        } else {
 468                /* learned new entry */
 469                spin_lock(&vxlan->hash_lock);
 470                err = vxlan_fdb_create(vxlan, src_mac, src_ip,
 471                                       NUD_REACHABLE,
 472                                       NLM_F_EXCL|NLM_F_CREATE);
 473                spin_unlock(&vxlan->hash_lock);
 474        }
 475}
 476
 477
 478/* See if multicast group is already in use by other ID */
 479static bool vxlan_group_used(struct vxlan_net *vn,
 480                             const struct vxlan_dev *this)
 481{
 482        const struct vxlan_dev *vxlan;
 483        unsigned h;
 484
 485        for (h = 0; h < VNI_HASH_SIZE; ++h)
 486                hlist_for_each_entry(vxlan, &vn->vni_list[h], hlist) {
 487                        if (vxlan == this)
 488                                continue;
 489
 490                        if (!netif_running(vxlan->dev))
 491                                continue;
 492
 493                        if (vxlan->gaddr == this->gaddr)
 494                                return true;
 495                }
 496
 497        return false;
 498}
 499
 500/* kernel equivalent to IP_ADD_MEMBERSHIP */
 501static int vxlan_join_group(struct net_device *dev)
 502{
 503        struct vxlan_dev *vxlan = netdev_priv(dev);
 504        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 505        struct sock *sk = vn->sock->sk;
 506        struct ip_mreqn mreq = {
 507                .imr_multiaddr.s_addr   = vxlan->gaddr,
 508                .imr_ifindex            = vxlan->link,
 509        };
 510        int err;
 511
 512        /* Already a member of group */
 513        if (vxlan_group_used(vn, vxlan))
 514                return 0;
 515
 516        /* Need to drop RTNL to call multicast join */
 517        rtnl_unlock();
 518        lock_sock(sk);
 519        err = ip_mc_join_group(sk, &mreq);
 520        release_sock(sk);
 521        rtnl_lock();
 522
 523        return err;
 524}
 525
 526
 527/* kernel equivalent to IP_DROP_MEMBERSHIP */
 528static int vxlan_leave_group(struct net_device *dev)
 529{
 530        struct vxlan_dev *vxlan = netdev_priv(dev);
 531        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 532        int err = 0;
 533        struct sock *sk = vn->sock->sk;
 534        struct ip_mreqn mreq = {
 535                .imr_multiaddr.s_addr   = vxlan->gaddr,
 536                .imr_ifindex            = vxlan->link,
 537        };
 538
 539        /* Only leave group when last vxlan is done. */
 540        if (vxlan_group_used(vn, vxlan))
 541                return 0;
 542
 543        /* Need to drop RTNL to call multicast leave */
 544        rtnl_unlock();
 545        lock_sock(sk);
 546        err = ip_mc_leave_group(sk, &mreq);
 547        release_sock(sk);
 548        rtnl_lock();
 549
 550        return err;
 551}
 552
 553/* Callback from net/ipv4/udp.c to receive packets */
 554static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 555{
 556        struct iphdr *oip;
 557        struct vxlanhdr *vxh;
 558        struct vxlan_dev *vxlan;
 559        struct vxlan_stats *stats;
 560        __u32 vni;
 561        int err;
 562
 563        /* pop off outer UDP header */
 564        __skb_pull(skb, sizeof(struct udphdr));
 565
 566        /* Need Vxlan and inner Ethernet header to be present */
 567        if (!pskb_may_pull(skb, sizeof(struct vxlanhdr)))
 568                goto error;
 569
 570        /* Drop packets with reserved bits set */
 571        vxh = (struct vxlanhdr *) skb->data;
 572        if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
 573            (vxh->vx_vni & htonl(0xff))) {
 574                netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
 575                           ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
 576                goto error;
 577        }
 578
 579        __skb_pull(skb, sizeof(struct vxlanhdr));
 580
 581        /* Is this VNI defined? */
 582        vni = ntohl(vxh->vx_vni) >> 8;
 583        vxlan = vxlan_find_vni(sock_net(sk), vni);
 584        if (!vxlan) {
 585                netdev_dbg(skb->dev, "unknown vni %d\n", vni);
 586                goto drop;
 587        }
 588
 589        if (!pskb_may_pull(skb, ETH_HLEN)) {
 590                vxlan->dev->stats.rx_length_errors++;
 591                vxlan->dev->stats.rx_errors++;
 592                goto drop;
 593        }
 594
 595        skb_reset_mac_header(skb);
 596
 597        /* Re-examine inner Ethernet packet */
 598        oip = ip_hdr(skb);
 599        skb->protocol = eth_type_trans(skb, vxlan->dev);
 600
 601        /* Ignore packet loops (and multicast echo) */
 602        if (compare_ether_addr(eth_hdr(skb)->h_source,
 603                               vxlan->dev->dev_addr) == 0)
 604                goto drop;
 605
 606        if (vxlan->flags & VXLAN_F_LEARN)
 607                vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
 608
 609        __skb_tunnel_rx(skb, vxlan->dev);
 610        skb_reset_network_header(skb);
 611
 612        /* If the NIC driver gave us an encapsulated packet with
 613         * CHECKSUM_UNNECESSARY and Rx checksum feature is enabled,
 614         * leave the CHECKSUM_UNNECESSARY, the device checksummed it
 615         * for us. Otherwise force the upper layers to verify it.
 616         */
 617        if (skb->ip_summed != CHECKSUM_UNNECESSARY || !skb->encapsulation ||
 618            !(vxlan->dev->features & NETIF_F_RXCSUM))
 619                skb->ip_summed = CHECKSUM_NONE;
 620
 621        skb->encapsulation = 0;
 622
 623        err = IP_ECN_decapsulate(oip, skb);
 624        if (unlikely(err)) {
 625                if (log_ecn_error)
 626                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
 627                                             &oip->saddr, oip->tos);
 628                if (err > 1) {
 629                        ++vxlan->dev->stats.rx_frame_errors;
 630                        ++vxlan->dev->stats.rx_errors;
 631                        goto drop;
 632                }
 633        }
 634
 635        stats = this_cpu_ptr(vxlan->stats);
 636        u64_stats_update_begin(&stats->syncp);
 637        stats->rx_packets++;
 638        stats->rx_bytes += skb->len;
 639        u64_stats_update_end(&stats->syncp);
 640
 641        netif_rx(skb);
 642
 643        return 0;
 644error:
 645        /* Put UDP header back */
 646        __skb_push(skb, sizeof(struct udphdr));
 647
 648        return 1;
 649drop:
 650        /* Consume bad packet */
 651        kfree_skb(skb);
 652        return 0;
 653}
 654
 655static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 656{
 657        struct vxlan_dev *vxlan = netdev_priv(dev);
 658        struct arphdr *parp;
 659        u8 *arpptr, *sha;
 660        __be32 sip, tip;
 661        struct neighbour *n;
 662
 663        if (dev->flags & IFF_NOARP)
 664                goto out;
 665
 666        if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
 667                dev->stats.tx_dropped++;
 668                goto out;
 669        }
 670        parp = arp_hdr(skb);
 671
 672        if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
 673             parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
 674            parp->ar_pro != htons(ETH_P_IP) ||
 675            parp->ar_op != htons(ARPOP_REQUEST) ||
 676            parp->ar_hln != dev->addr_len ||
 677            parp->ar_pln != 4)
 678                goto out;
 679        arpptr = (u8 *)parp + sizeof(struct arphdr);
 680        sha = arpptr;
 681        arpptr += dev->addr_len;        /* sha */
 682        memcpy(&sip, arpptr, sizeof(sip));
 683        arpptr += sizeof(sip);
 684        arpptr += dev->addr_len;        /* tha */
 685        memcpy(&tip, arpptr, sizeof(tip));
 686
 687        if (ipv4_is_loopback(tip) ||
 688            ipv4_is_multicast(tip))
 689                goto out;
 690
 691        n = neigh_lookup(&arp_tbl, &tip, dev);
 692
 693        if (n) {
 694                struct vxlan_dev *vxlan = netdev_priv(dev);
 695                struct vxlan_fdb *f;
 696                struct sk_buff  *reply;
 697
 698                if (!(n->nud_state & NUD_CONNECTED)) {
 699                        neigh_release(n);
 700                        goto out;
 701                }
 702
 703                f = vxlan_find_mac(vxlan, n->ha);
 704                if (f && f->remote_ip == 0) {
 705                        /* bridge-local neighbor */
 706                        neigh_release(n);
 707                        goto out;
 708                }
 709
 710                reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
 711                                n->ha, sha);
 712
 713                neigh_release(n);
 714
 715                skb_reset_mac_header(reply);
 716                __skb_pull(reply, skb_network_offset(reply));
 717                reply->ip_summed = CHECKSUM_UNNECESSARY;
 718                reply->pkt_type = PACKET_HOST;
 719
 720                if (netif_rx_ni(reply) == NET_RX_DROP)
 721                        dev->stats.rx_dropped++;
 722        } else if (vxlan->flags & VXLAN_F_L3MISS)
 723                vxlan_ip_miss(dev, tip);
 724out:
 725        consume_skb(skb);
 726        return NETDEV_TX_OK;
 727}
 728
 729static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 730{
 731        struct vxlan_dev *vxlan = netdev_priv(dev);
 732        struct neighbour *n;
 733        struct iphdr *pip;
 734
 735        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
 736                return false;
 737
 738        n = NULL;
 739        switch (ntohs(eth_hdr(skb)->h_proto)) {
 740        case ETH_P_IP:
 741                if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 742                        return false;
 743                pip = ip_hdr(skb);
 744                n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
 745                break;
 746        default:
 747                return false;
 748        }
 749
 750        if (n) {
 751                bool diff;
 752
 753                diff = compare_ether_addr(eth_hdr(skb)->h_dest, n->ha) != 0;
 754                if (diff) {
 755                        memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
 756                                dev->addr_len);
 757                        memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
 758                }
 759                neigh_release(n);
 760                return diff;
 761        } else if (vxlan->flags & VXLAN_F_L3MISS)
 762                vxlan_ip_miss(dev, pip->daddr);
 763        return false;
 764}
 765
 766/* Extract dsfield from inner protocol */
 767static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
 768                                   const struct sk_buff *skb)
 769{
 770        if (skb->protocol == htons(ETH_P_IP))
 771                return iph->tos;
 772        else if (skb->protocol == htons(ETH_P_IPV6))
 773                return ipv6_get_dsfield((const struct ipv6hdr *)iph);
 774        else
 775                return 0;
 776}
 777
 778/* Propogate ECN bits out */
 779static inline u8 vxlan_ecn_encap(u8 tos,
 780                                 const struct iphdr *iph,
 781                                 const struct sk_buff *skb)
 782{
 783        u8 inner = vxlan_get_dsfield(iph, skb);
 784
 785        return INET_ECN_encapsulate(tos, inner);
 786}
 787
 788static void vxlan_sock_free(struct sk_buff *skb)
 789{
 790        sock_put(skb->sk);
 791}
 792
 793/* On transmit, associate with the tunnel socket */
 794static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
 795{
 796        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 797        struct sock *sk = vn->sock->sk;
 798
 799        skb_orphan(skb);
 800        sock_hold(sk);
 801        skb->sk = sk;
 802        skb->destructor = vxlan_sock_free;
 803}
 804
 805/* Compute source port for outgoing packet
 806 *   first choice to use L4 flow hash since it will spread
 807 *     better and maybe available from hardware
 808 *   secondary choice is to use jhash on the Ethernet header
 809 */
 810static u16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb)
 811{
 812        unsigned int range = (vxlan->port_max - vxlan->port_min) + 1;
 813        u32 hash;
 814
 815        hash = skb_get_rxhash(skb);
 816        if (!hash)
 817                hash = jhash(skb->data, 2 * ETH_ALEN,
 818                             (__force u32) skb->protocol);
 819
 820        return (((u64) hash * range) >> 32) + vxlan->port_min;
 821}
 822
 823/* Transmit local packets over Vxlan
 824 *
 825 * Outer IP header inherits ECN and DF from inner header.
 826 * Outer UDP destination is the VXLAN assigned port.
 827 *           source port is based on hash of flow
 828 */
 829static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 830{
 831        struct vxlan_dev *vxlan = netdev_priv(dev);
 832        struct rtable *rt;
 833        const struct iphdr *old_iph;
 834        struct ethhdr *eth;
 835        struct iphdr *iph;
 836        struct vxlanhdr *vxh;
 837        struct udphdr *uh;
 838        struct flowi4 fl4;
 839        unsigned int pkt_len = skb->len;
 840        __be32 dst;
 841        __u16 src_port;
 842        __be16 df = 0;
 843        __u8 tos, ttl;
 844        int err;
 845        bool did_rsc = false;
 846        const struct vxlan_fdb *f;
 847
 848        skb_reset_mac_header(skb);
 849        eth = eth_hdr(skb);
 850
 851        if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP)
 852                return arp_reduce(dev, skb);
 853        else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP)
 854                did_rsc = route_shortcircuit(dev, skb);
 855
 856        f = vxlan_find_mac(vxlan, eth->h_dest);
 857        if (f == NULL) {
 858                did_rsc = false;
 859                dst = vxlan->gaddr;
 860                if (!dst && (vxlan->flags & VXLAN_F_L2MISS) &&
 861                    !is_multicast_ether_addr(eth->h_dest))
 862                        vxlan_fdb_miss(vxlan, eth->h_dest);
 863        } else
 864                dst = f->remote_ip;
 865
 866        if (!dst) {
 867                if (did_rsc) {
 868                        __skb_pull(skb, skb_network_offset(skb));
 869                        skb->ip_summed = CHECKSUM_NONE;
 870                        skb->pkt_type = PACKET_HOST;
 871
 872                        /* short-circuited back to local bridge */
 873                        if (netif_rx(skb) == NET_RX_SUCCESS) {
 874                                struct vxlan_stats *stats =
 875                                                this_cpu_ptr(vxlan->stats);
 876
 877                                u64_stats_update_begin(&stats->syncp);
 878                                stats->tx_packets++;
 879                                stats->tx_bytes += pkt_len;
 880                                u64_stats_update_end(&stats->syncp);
 881                        } else {
 882                                dev->stats.tx_errors++;
 883                                dev->stats.tx_aborted_errors++;
 884                        }
 885                        return NETDEV_TX_OK;
 886                }
 887                goto drop;
 888        }
 889
 890        if (!skb->encapsulation) {
 891                skb_reset_inner_headers(skb);
 892                skb->encapsulation = 1;
 893        }
 894
 895        /* Need space for new headers (invalidates iph ptr) */
 896        if (skb_cow_head(skb, VXLAN_HEADROOM))
 897                goto drop;
 898
 899        old_iph = ip_hdr(skb);
 900
 901        ttl = vxlan->ttl;
 902        if (!ttl && IN_MULTICAST(ntohl(dst)))
 903                ttl = 1;
 904
 905        tos = vxlan->tos;
 906        if (tos == 1)
 907                tos = vxlan_get_dsfield(old_iph, skb);
 908
 909        src_port = vxlan_src_port(vxlan, skb);
 910
 911        memset(&fl4, 0, sizeof(fl4));
 912        fl4.flowi4_oif = vxlan->link;
 913        fl4.flowi4_tos = RT_TOS(tos);
 914        fl4.daddr = dst;
 915        fl4.saddr = vxlan->saddr;
 916
 917        rt = ip_route_output_key(dev_net(dev), &fl4);
 918        if (IS_ERR(rt)) {
 919                netdev_dbg(dev, "no route to %pI4\n", &dst);
 920                dev->stats.tx_carrier_errors++;
 921                goto tx_error;
 922        }
 923
 924        if (rt->dst.dev == dev) {
 925                netdev_dbg(dev, "circular route to %pI4\n", &dst);
 926                ip_rt_put(rt);
 927                dev->stats.collisions++;
 928                goto tx_error;
 929        }
 930
 931        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 932        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 933                              IPSKB_REROUTED);
 934        skb_dst_drop(skb);
 935        skb_dst_set(skb, &rt->dst);
 936
 937        vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
 938        vxh->vx_flags = htonl(VXLAN_FLAGS);
 939        vxh->vx_vni = htonl(vxlan->vni << 8);
 940
 941        __skb_push(skb, sizeof(*uh));
 942        skb_reset_transport_header(skb);
 943        uh = udp_hdr(skb);
 944
 945        uh->dest = htons(vxlan_port);
 946        uh->source = htons(src_port);
 947
 948        uh->len = htons(skb->len);
 949        uh->check = 0;
 950
 951        __skb_push(skb, sizeof(*iph));
 952        skb_reset_network_header(skb);
 953        iph             = ip_hdr(skb);
 954        iph->version    = 4;
 955        iph->ihl        = sizeof(struct iphdr) >> 2;
 956        iph->frag_off   = df;
 957        iph->protocol   = IPPROTO_UDP;
 958        iph->tos        = vxlan_ecn_encap(tos, old_iph, skb);
 959        iph->daddr      = dst;
 960        iph->saddr      = fl4.saddr;
 961        iph->ttl        = ttl ? : ip4_dst_hoplimit(&rt->dst);
 962        tunnel_ip_select_ident(skb, old_iph, &rt->dst);
 963
 964        nf_reset(skb);
 965
 966        vxlan_set_owner(dev, skb);
 967
 968        /* See iptunnel_xmit() */
 969        if (skb->ip_summed != CHECKSUM_PARTIAL)
 970                skb->ip_summed = CHECKSUM_NONE;
 971
 972        err = ip_local_out(skb);
 973        if (likely(net_xmit_eval(err) == 0)) {
 974                struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
 975
 976                u64_stats_update_begin(&stats->syncp);
 977                stats->tx_packets++;
 978                stats->tx_bytes += pkt_len;
 979                u64_stats_update_end(&stats->syncp);
 980        } else {
 981                dev->stats.tx_errors++;
 982                dev->stats.tx_aborted_errors++;
 983        }
 984        return NETDEV_TX_OK;
 985
 986drop:
 987        dev->stats.tx_dropped++;
 988        goto tx_free;
 989
 990tx_error:
 991        dev->stats.tx_errors++;
 992tx_free:
 993        dev_kfree_skb(skb);
 994        return NETDEV_TX_OK;
 995}
 996
 997/* Walk the forwarding table and purge stale entries */
 998static void vxlan_cleanup(unsigned long arg)
 999{
1000        struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
1001        unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
1002        unsigned int h;
1003
1004        if (!netif_running(vxlan->dev))
1005                return;
1006
1007        spin_lock_bh(&vxlan->hash_lock);
1008        for (h = 0; h < FDB_HASH_SIZE; ++h) {
1009                struct hlist_node *p, *n;
1010                hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
1011                        struct vxlan_fdb *f
1012                                = container_of(p, struct vxlan_fdb, hlist);
1013                        unsigned long timeout;
1014
1015                        if (f->state & NUD_PERMANENT)
1016                                continue;
1017
1018                        timeout = f->used + vxlan->age_interval * HZ;
1019                        if (time_before_eq(timeout, jiffies)) {
1020                                netdev_dbg(vxlan->dev,
1021                                           "garbage collect %pM\n",
1022                                           f->eth_addr);
1023                                f->state = NUD_STALE;
1024                                vxlan_fdb_destroy(vxlan, f);
1025                        } else if (time_before(timeout, next_timer))
1026                                next_timer = timeout;
1027                }
1028        }
1029        spin_unlock_bh(&vxlan->hash_lock);
1030
1031        mod_timer(&vxlan->age_timer, next_timer);
1032}
1033
1034/* Setup stats when device is created */
1035static int vxlan_init(struct net_device *dev)
1036{
1037        struct vxlan_dev *vxlan = netdev_priv(dev);
1038
1039        vxlan->stats = alloc_percpu(struct vxlan_stats);
1040        if (!vxlan->stats)
1041                return -ENOMEM;
1042
1043        return 0;
1044}
1045
1046/* Start ageing timer and join group when device is brought up */
1047static int vxlan_open(struct net_device *dev)
1048{
1049        struct vxlan_dev *vxlan = netdev_priv(dev);
1050        int err;
1051
1052        if (vxlan->gaddr) {
1053                err = vxlan_join_group(dev);
1054                if (err)
1055                        return err;
1056        }
1057
1058        if (vxlan->age_interval)
1059                mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
1060
1061        return 0;
1062}
1063
1064/* Purge the forwarding table */
1065static void vxlan_flush(struct vxlan_dev *vxlan)
1066{
1067        unsigned h;
1068
1069        spin_lock_bh(&vxlan->hash_lock);
1070        for (h = 0; h < FDB_HASH_SIZE; ++h) {
1071                struct hlist_node *p, *n;
1072                hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
1073                        struct vxlan_fdb *f
1074                                = container_of(p, struct vxlan_fdb, hlist);
1075                        vxlan_fdb_destroy(vxlan, f);
1076                }
1077        }
1078        spin_unlock_bh(&vxlan->hash_lock);
1079}
1080
1081/* Cleanup timer and forwarding table on shutdown */
1082static int vxlan_stop(struct net_device *dev)
1083{
1084        struct vxlan_dev *vxlan = netdev_priv(dev);
1085
1086        if (vxlan->gaddr)
1087                vxlan_leave_group(dev);
1088
1089        del_timer_sync(&vxlan->age_timer);
1090
1091        vxlan_flush(vxlan);
1092
1093        return 0;
1094}
1095
1096/* Merge per-cpu statistics */
1097static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
1098                                               struct rtnl_link_stats64 *stats)
1099{
1100        struct vxlan_dev *vxlan = netdev_priv(dev);
1101        struct vxlan_stats tmp, sum = { 0 };
1102        unsigned int cpu;
1103
1104        for_each_possible_cpu(cpu) {
1105                unsigned int start;
1106                const struct vxlan_stats *stats
1107                        = per_cpu_ptr(vxlan->stats, cpu);
1108
1109                do {
1110                        start = u64_stats_fetch_begin_bh(&stats->syncp);
1111                        memcpy(&tmp, stats, sizeof(tmp));
1112                } while (u64_stats_fetch_retry_bh(&stats->syncp, start));
1113
1114                sum.tx_bytes   += tmp.tx_bytes;
1115                sum.tx_packets += tmp.tx_packets;
1116                sum.rx_bytes   += tmp.rx_bytes;
1117                sum.rx_packets += tmp.rx_packets;
1118        }
1119
1120        stats->tx_bytes   = sum.tx_bytes;
1121        stats->tx_packets = sum.tx_packets;
1122        stats->rx_bytes   = sum.rx_bytes;
1123        stats->rx_packets = sum.rx_packets;
1124
1125        stats->multicast = dev->stats.multicast;
1126        stats->rx_length_errors = dev->stats.rx_length_errors;
1127        stats->rx_frame_errors = dev->stats.rx_frame_errors;
1128        stats->rx_errors = dev->stats.rx_errors;
1129
1130        stats->tx_dropped = dev->stats.tx_dropped;
1131        stats->tx_carrier_errors  = dev->stats.tx_carrier_errors;
1132        stats->tx_aborted_errors  = dev->stats.tx_aborted_errors;
1133        stats->collisions  = dev->stats.collisions;
1134        stats->tx_errors = dev->stats.tx_errors;
1135
1136        return stats;
1137}
1138
/*
 * ndo_set_rx_mode handler.  Deliberately empty: the driver has nothing
 * to do here, the hook is only provided so the callback exists.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
	/* intentionally left blank */
}
1143
1144static const struct net_device_ops vxlan_netdev_ops = {
1145        .ndo_init               = vxlan_init,
1146        .ndo_open               = vxlan_open,
1147        .ndo_stop               = vxlan_stop,
1148        .ndo_start_xmit         = vxlan_xmit,
1149        .ndo_get_stats64        = vxlan_stats64,
1150        .ndo_set_rx_mode        = vxlan_set_multicast_list,
1151        .ndo_change_mtu         = eth_change_mtu,
1152        .ndo_validate_addr      = eth_validate_addr,
1153        .ndo_set_mac_address    = eth_mac_addr,
1154        .ndo_fdb_add            = vxlan_fdb_add,
1155        .ndo_fdb_del            = vxlan_fdb_delete,
1156        .ndo_fdb_dump           = vxlan_fdb_dump,
1157};
1158
1159/* Info for udev, that this is a virtual tunnel endpoint */
1160static struct device_type vxlan_type = {
1161        .name = "vxlan",
1162};
1163
1164static void vxlan_free(struct net_device *dev)
1165{
1166        struct vxlan_dev *vxlan = netdev_priv(dev);
1167
1168        free_percpu(vxlan->stats);
1169        free_netdev(dev);
1170}
1171
1172/* Initialize the device structure. */
1173static void vxlan_setup(struct net_device *dev)
1174{
1175        struct vxlan_dev *vxlan = netdev_priv(dev);
1176        unsigned h;
1177        int low, high;
1178
1179        eth_hw_addr_random(dev);
1180        ether_setup(dev);
1181        dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;
1182
1183        dev->netdev_ops = &vxlan_netdev_ops;
1184        dev->destructor = vxlan_free;
1185        SET_NETDEV_DEVTYPE(dev, &vxlan_type);
1186
1187        dev->tx_queue_len = 0;
1188        dev->features   |= NETIF_F_LLTX;
1189        dev->features   |= NETIF_F_NETNS_LOCAL;
1190        dev->features   |= NETIF_F_SG | NETIF_F_HW_CSUM;
1191        dev->features   |= NETIF_F_RXCSUM;
1192
1193        dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
1194        dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1195        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1196
1197        spin_lock_init(&vxlan->hash_lock);
1198
1199        init_timer_deferrable(&vxlan->age_timer);
1200        vxlan->age_timer.function = vxlan_cleanup;
1201        vxlan->age_timer.data = (unsigned long) vxlan;
1202
1203        inet_get_local_port_range(&low, &high);
1204        vxlan->port_min = low;
1205        vxlan->port_max = high;
1206
1207        vxlan->dev = dev;
1208
1209        for (h = 0; h < FDB_HASH_SIZE; ++h)
1210                INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
1211}
1212
1213static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
1214        [IFLA_VXLAN_ID]         = { .type = NLA_U32 },
1215        [IFLA_VXLAN_GROUP]      = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1216        [IFLA_VXLAN_LINK]       = { .type = NLA_U32 },
1217        [IFLA_VXLAN_LOCAL]      = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1218        [IFLA_VXLAN_TOS]        = { .type = NLA_U8 },
1219        [IFLA_VXLAN_TTL]        = { .type = NLA_U8 },
1220        [IFLA_VXLAN_LEARNING]   = { .type = NLA_U8 },
1221        [IFLA_VXLAN_AGEING]     = { .type = NLA_U32 },
1222        [IFLA_VXLAN_LIMIT]      = { .type = NLA_U32 },
1223        [IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
1224        [IFLA_VXLAN_PROXY]      = { .type = NLA_U8 },
1225        [IFLA_VXLAN_RSC]        = { .type = NLA_U8 },
1226        [IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
1227        [IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
1228};
1229
1230static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
1231{
1232        if (tb[IFLA_ADDRESS]) {
1233                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
1234                        pr_debug("invalid link address (not ethernet)\n");
1235                        return -EINVAL;
1236                }
1237
1238                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
1239                        pr_debug("invalid all zero ethernet address\n");
1240                        return -EADDRNOTAVAIL;
1241                }
1242        }
1243
1244        if (!data)
1245                return -EINVAL;
1246
1247        if (data[IFLA_VXLAN_ID]) {
1248                __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
1249                if (id >= VXLAN_VID_MASK)
1250                        return -ERANGE;
1251        }
1252
1253        if (data[IFLA_VXLAN_GROUP]) {
1254                __be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
1255                if (!IN_MULTICAST(ntohl(gaddr))) {
1256                        pr_debug("group address is not IPv4 multicast\n");
1257                        return -EADDRNOTAVAIL;
1258                }
1259        }
1260
1261        if (data[IFLA_VXLAN_PORT_RANGE]) {
1262                const struct ifla_vxlan_port_range *p
1263                        = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
1264
1265                if (ntohs(p->high) < ntohs(p->low)) {
1266                        pr_debug("port range %u .. %u not valid\n",
1267                                 ntohs(p->low), ntohs(p->high));
1268                        return -EINVAL;
1269                }
1270        }
1271
1272        return 0;
1273}
1274
1275static void vxlan_get_drvinfo(struct net_device *netdev,
1276                              struct ethtool_drvinfo *drvinfo)
1277{
1278        strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
1279        strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
1280}
1281
1282static const struct ethtool_ops vxlan_ethtool_ops = {
1283        .get_drvinfo    = vxlan_get_drvinfo,
1284        .get_link       = ethtool_op_get_link,
1285};
1286
1287static int vxlan_newlink(struct net *net, struct net_device *dev,
1288                         struct nlattr *tb[], struct nlattr *data[])
1289{
1290        struct vxlan_dev *vxlan = netdev_priv(dev);
1291        __u32 vni;
1292        int err;
1293
1294        if (!data[IFLA_VXLAN_ID])
1295                return -EINVAL;
1296
1297        vni = nla_get_u32(data[IFLA_VXLAN_ID]);
1298        if (vxlan_find_vni(net, vni)) {
1299                pr_info("duplicate VNI %u\n", vni);
1300                return -EEXIST;
1301        }
1302        vxlan->vni = vni;
1303
1304        if (data[IFLA_VXLAN_GROUP])
1305                vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
1306
1307        if (data[IFLA_VXLAN_LOCAL])
1308                vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
1309
1310        if (data[IFLA_VXLAN_LINK] &&
1311            (vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]))) {
1312                struct net_device *lowerdev
1313                         = __dev_get_by_index(net, vxlan->link);
1314
1315                if (!lowerdev) {
1316                        pr_info("ifindex %d does not exist\n", vxlan->link);
1317                        return -ENODEV;
1318                }
1319
1320                if (!tb[IFLA_MTU])
1321                        dev->mtu = lowerdev->mtu - VXLAN_HEADROOM;
1322
1323                /* update header length based on lower device */
1324                dev->hard_header_len = lowerdev->hard_header_len +
1325                                       VXLAN_HEADROOM;
1326        }
1327
1328        if (data[IFLA_VXLAN_TOS])
1329                vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
1330
1331        if (data[IFLA_VXLAN_TTL])
1332                vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
1333
1334        if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
1335                vxlan->flags |= VXLAN_F_LEARN;
1336
1337        if (data[IFLA_VXLAN_AGEING])
1338                vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
1339        else
1340                vxlan->age_interval = FDB_AGE_DEFAULT;
1341
1342        if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
1343                vxlan->flags |= VXLAN_F_PROXY;
1344
1345        if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
1346                vxlan->flags |= VXLAN_F_RSC;
1347
1348        if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
1349                vxlan->flags |= VXLAN_F_L2MISS;
1350
1351        if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
1352                vxlan->flags |= VXLAN_F_L3MISS;
1353
1354        if (data[IFLA_VXLAN_LIMIT])
1355                vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
1356
1357        if (data[IFLA_VXLAN_PORT_RANGE]) {
1358                const struct ifla_vxlan_port_range *p
1359                        = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
1360                vxlan->port_min = ntohs(p->low);
1361                vxlan->port_max = ntohs(p->high);
1362        }
1363
1364        SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops);
1365
1366        err = register_netdevice(dev);
1367        if (!err)
1368                hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));
1369
1370        return err;
1371}
1372
1373static void vxlan_dellink(struct net_device *dev, struct list_head *head)
1374{
1375        struct vxlan_dev *vxlan = netdev_priv(dev);
1376
1377        hlist_del_rcu(&vxlan->hlist);
1378
1379        unregister_netdevice_queue(dev, head);
1380}
1381
1382static size_t vxlan_get_size(const struct net_device *dev)
1383{
1384
1385        return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
1386                nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
1387                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
1388                nla_total_size(sizeof(__be32))+ /* IFLA_VXLAN_LOCAL */
1389                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
1390                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
1391                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
1392                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
1393                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
1394                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
1395                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
1396                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
1397                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
1398                nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
1399                0;
1400}
1401
1402static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
1403{
1404        const struct vxlan_dev *vxlan = netdev_priv(dev);
1405        struct ifla_vxlan_port_range ports = {
1406                .low =  htons(vxlan->port_min),
1407                .high = htons(vxlan->port_max),
1408        };
1409
1410        if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
1411                goto nla_put_failure;
1412
1413        if (vxlan->gaddr && nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
1414                goto nla_put_failure;
1415
1416        if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
1417                goto nla_put_failure;
1418
1419        if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
1420                goto nla_put_failure;
1421
1422        if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
1423            nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
1424            nla_put_u8(skb, IFLA_VXLAN_LEARNING,
1425                        !!(vxlan->flags & VXLAN_F_LEARN)) ||
1426            nla_put_u8(skb, IFLA_VXLAN_PROXY,
1427                        !!(vxlan->flags & VXLAN_F_PROXY)) ||
1428            nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
1429            nla_put_u8(skb, IFLA_VXLAN_L2MISS,
1430                        !!(vxlan->flags & VXLAN_F_L2MISS)) ||
1431            nla_put_u8(skb, IFLA_VXLAN_L3MISS,
1432                        !!(vxlan->flags & VXLAN_F_L3MISS)) ||
1433            nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
1434            nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax))
1435                goto nla_put_failure;
1436
1437        if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
1438                goto nla_put_failure;
1439
1440        return 0;
1441
1442nla_put_failure:
1443        return -EMSGSIZE;
1444}
1445
1446static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
1447        .kind           = "vxlan",
1448        .maxtype        = IFLA_VXLAN_MAX,
1449        .policy         = vxlan_policy,
1450        .priv_size      = sizeof(struct vxlan_dev),
1451        .setup          = vxlan_setup,
1452        .validate       = vxlan_validate,
1453        .newlink        = vxlan_newlink,
1454        .dellink        = vxlan_dellink,
1455        .get_size       = vxlan_get_size,
1456        .fill_info      = vxlan_fill_info,
1457};
1458
1459static __net_init int vxlan_init_net(struct net *net)
1460{
1461        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1462        struct sock *sk;
1463        struct sockaddr_in vxlan_addr = {
1464                .sin_family = AF_INET,
1465                .sin_addr.s_addr = htonl(INADDR_ANY),
1466        };
1467        int rc;
1468        unsigned h;
1469
1470        /* Create UDP socket for encapsulation receive. */
1471        rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
1472        if (rc < 0) {
1473                pr_debug("UDP socket create failed\n");
1474                return rc;
1475        }
1476        /* Put in proper namespace */
1477        sk = vn->sock->sk;
1478        sk_change_net(sk, net);
1479
1480        vxlan_addr.sin_port = htons(vxlan_port);
1481
1482        rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr,
1483                         sizeof(vxlan_addr));
1484        if (rc < 0) {
1485                pr_debug("bind for UDP socket %pI4:%u (%d)\n",
1486                         &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
1487                sk_release_kernel(sk);
1488                vn->sock = NULL;
1489                return rc;
1490        }
1491
1492        /* Disable multicast loopback */
1493        inet_sk(sk)->mc_loop = 0;
1494
1495        /* Mark socket as an encapsulation socket. */
1496        udp_sk(sk)->encap_type = 1;
1497        udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
1498        udp_encap_enable();
1499
1500        for (h = 0; h < VNI_HASH_SIZE; ++h)
1501                INIT_HLIST_HEAD(&vn->vni_list[h]);
1502
1503        return 0;
1504}
1505
1506static __net_exit void vxlan_exit_net(struct net *net)
1507{
1508        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
1509        struct vxlan_dev *vxlan;
1510        unsigned h;
1511
1512        rtnl_lock();
1513        for (h = 0; h < VNI_HASH_SIZE; ++h)
1514                hlist_for_each_entry(vxlan, &vn->vni_list[h], hlist)
1515                        dev_close(vxlan->dev);
1516        rtnl_unlock();
1517
1518        if (vn->sock) {
1519                sk_release_kernel(vn->sock->sk);
1520                vn->sock = NULL;
1521        }
1522}
1523
1524static struct pernet_operations vxlan_net_ops = {
1525        .init = vxlan_init_net,
1526        .exit = vxlan_exit_net,
1527        .id   = &vxlan_net_id,
1528        .size = sizeof(struct vxlan_net),
1529};
1530
1531static int __init vxlan_init_module(void)
1532{
1533        int rc;
1534
1535        get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
1536
1537        rc = register_pernet_device(&vxlan_net_ops);
1538        if (rc)
1539                goto out1;
1540
1541        rc = rtnl_link_register(&vxlan_link_ops);
1542        if (rc)
1543                goto out2;
1544
1545        return 0;
1546
1547out2:
1548        unregister_pernet_device(&vxlan_net_ops);
1549out1:
1550        return rc;
1551}
1552module_init(vxlan_init_module);
1553
1554static void __exit vxlan_cleanup_module(void)
1555{
1556        rtnl_link_unregister(&vxlan_link_ops);
1557        unregister_pernet_device(&vxlan_net_ops);
1558}
1559module_exit(vxlan_cleanup_module);
1560
1561MODULE_LICENSE("GPL");
1562MODULE_VERSION(VXLAN_VERSION);
1563MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>");
1564MODULE_ALIAS_RTNL_LINK("vxlan");
1565
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.