linux/net/ipv4/ip_gre.c
<<
>>
Prefs
   1/*
   2 *      Linux NET3:     GRE over IP protocol decoder.
   3 *
   4 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   5 *
   6 *      This program is free software; you can redistribute it and/or
   7 *      modify it under the terms of the GNU General Public License
   8 *      as published by the Free Software Foundation; either version
   9 *      2 of the License, or (at your option) any later version.
  10 *
  11 */
  12
  13#include <linux/capability.h>
  14#include <linux/module.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <asm/uaccess.h>
  18#include <linux/skbuff.h>
  19#include <linux/netdevice.h>
  20#include <linux/in.h>
  21#include <linux/tcp.h>
  22#include <linux/udp.h>
  23#include <linux/if_arp.h>
  24#include <linux/mroute.h>
  25#include <linux/init.h>
  26#include <linux/in6.h>
  27#include <linux/inetdevice.h>
  28#include <linux/igmp.h>
  29#include <linux/netfilter_ipv4.h>
  30#include <linux/etherdevice.h>
  31#include <linux/if_ether.h>
  32
  33#include <net/sock.h>
  34#include <net/ip.h>
  35#include <net/icmp.h>
  36#include <net/protocol.h>
  37#include <net/ipip.h>
  38#include <net/arp.h>
  39#include <net/checksum.h>
  40#include <net/dsfield.h>
  41#include <net/inet_ecn.h>
  42#include <net/xfrm.h>
  43#include <net/net_namespace.h>
  44#include <net/netns/generic.h>
  45#include <net/rtnetlink.h>
  46
  47#ifdef CONFIG_IPV6
  48#include <net/ipv6.h>
  49#include <net/ip6_fib.h>
  50#include <net/ip6_route.h>
  51#endif
  52
  53/*
  54   Problems & solutions
  55   --------------------
  56
  57   1. The most important issue is detecting local dead loops.
  58   They would cause complete host lockup in transmit, which
  59   would be "resolved" by stack overflow or, if queueing is enabled,
  60   with infinite looping in net_bh.
  61
  62   We cannot track such dead loops during route installation,
  63   it is infeasible task. The most general solutions would be
  64   to keep skb->encapsulation counter (sort of local ttl),
  65   and silently drop packet when it expires. It is the best
  66   solution, but it supposes maintaining a new variable in ALL
  67   skb, even if no tunneling is used.
  68
  69   Current solution: HARD_TX_LOCK lock breaks dead loops.
  70
  71
  72
  73   2. Networking dead loops would not kill routers, but would really
  74   kill network. IP hop limit plays role of "t->recursion" in this case,
  75   if we copy it from packet being encapsulated to upper header.
  76   It is very good solution, but it introduces two problems:
  77
  78   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  79     do not work over tunnels.
  80   - traceroute does not work. I planned to relay ICMP from tunnel,
  81     so that this problem would be solved and traceroute output
  82     would even more informative. This idea appeared to be wrong:
  83     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  84     true router now :-)), all routers (at least, in neighbourhood of mine)
  85     return only 8 bytes of payload. It is the end.
  86
  87   Hence, if we want that OSPF worked or traceroute said something reasonable,
  88   we should search for another solution.
  89
  90   One of them is to parse packet trying to detect inner encapsulation
  91   made by our node. It is difficult or even impossible, especially,
  92   taking into account fragmentation. To be short, it is not a solution at all.
  93
  94   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  95   We force DF flag on tunnels with preconfigured hop limit,
  96   that is ALL. :-) Well, it does not remove the problem completely,
  97   but exponential growth of network traffic is changed to linear
  98   (branches, that exceed pmtu are pruned) and tunnel mtu
  99   quickly degrades to value <68, where looping stops.
 100   Yes, it is not good if there exists a router in the loop,
 101   which does not force DF, even when encapsulating packets have DF set.
 102   But it is not our problem! Nobody could accuse us, we made
 103   all that we could make. Even if it is your gated who injected
 104   fatal route to network, even if it were you who configured
 105   fatal static route: you are innocent. :-)
 106
 107
 108
 109   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
 110   practically identical code. It would be good to glue them
 111   together, but it is not very evident, how to make them modular.
 112   sit is integral part of IPv6, ipip and gre are naturally modular.
 113   We could extract common parts (hash table, ioctl etc)
 114   to a separate module (ip_tunnel.c).
 115
 116   Alexey Kuznetsov.
 117 */
 118
 119static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 120static int ipgre_tunnel_init(struct net_device *dev);
 121static void ipgre_tunnel_setup(struct net_device *dev);
 122static int ipgre_tunnel_bind_dev(struct net_device *dev);
 123
 124/* Fallback tunnel: no source, no destination, no key, no options */
 125
 126#define HASH_SIZE  16
 127
 128static int ipgre_net_id __read_mostly;
 129struct ipgre_net {
 130        struct ip_tunnel *tunnels[4][HASH_SIZE];
 131
 132        struct net_device *fb_tunnel_dev;
 133};
 134
 135/* Tunnel hash table */
 136
 137/*
 138   4 hash tables:
 139
 140   3: (remote,local)
 141   2: (remote,*)
 142   1: (*,local)
 143   0: (*,*)
 144
 145   We require exact key match i.e. if a key is present in packet
 146   it will match only tunnel with the same key; if it is not present,
 147   it will match only keyless tunnel.
 148
 149   All keyless packets, if not matched by configured keyless tunnels,
 150   will match the fallback tunnel.
 151 */
 152
 153#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 154
 155#define tunnels_r_l     tunnels[3]
 156#define tunnels_r       tunnels[2]
 157#define tunnels_l       tunnels[1]
 158#define tunnels_wc      tunnels[0]
 159/*
 160 * Locking : hash tables are protected by RCU and a spinlock
 161 */
 162static DEFINE_SPINLOCK(ipgre_lock);
 163
 164#define for_each_ip_tunnel_rcu(start) \
 165        for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 166
 167/* Given src, dst and key, find appropriate for input tunnel. */
 168
 169static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 170                                              __be32 remote, __be32 local,
 171                                              __be32 key, __be16 gre_proto)
 172{
 173        struct net *net = dev_net(dev);
 174        int link = dev->ifindex;
 175        unsigned h0 = HASH(remote);
 176        unsigned h1 = HASH(key);
 177        struct ip_tunnel *t, *cand = NULL;
 178        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 179        int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
 180                       ARPHRD_ETHER : ARPHRD_IPGRE;
 181        int score, cand_score = 4;
 182
 183        for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
 184                if (local != t->parms.iph.saddr ||
 185                    remote != t->parms.iph.daddr ||
 186                    key != t->parms.i_key ||
 187                    !(t->dev->flags & IFF_UP))
 188                        continue;
 189
 190                if (t->dev->type != ARPHRD_IPGRE &&
 191                    t->dev->type != dev_type)
 192                        continue;
 193
 194                score = 0;
 195                if (t->parms.link != link)
 196                        score |= 1;
 197                if (t->dev->type != dev_type)
 198                        score |= 2;
 199                if (score == 0)
 200                        return t;
 201
 202                if (score < cand_score) {
 203                        cand = t;
 204                        cand_score = score;
 205                }
 206        }
 207
 208        for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
 209                if (remote != t->parms.iph.daddr ||
 210                    key != t->parms.i_key ||
 211                    !(t->dev->flags & IFF_UP))
 212                        continue;
 213
 214                if (t->dev->type != ARPHRD_IPGRE &&
 215                    t->dev->type != dev_type)
 216                        continue;
 217
 218                score = 0;
 219                if (t->parms.link != link)
 220                        score |= 1;
 221                if (t->dev->type != dev_type)
 222                        score |= 2;
 223                if (score == 0)
 224                        return t;
 225
 226                if (score < cand_score) {
 227                        cand = t;
 228                        cand_score = score;
 229                }
 230        }
 231
 232        for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
 233                if ((local != t->parms.iph.saddr &&
 234                     (local != t->parms.iph.daddr ||
 235                      !ipv4_is_multicast(local))) ||
 236                    key != t->parms.i_key ||
 237                    !(t->dev->flags & IFF_UP))
 238                        continue;
 239
 240                if (t->dev->type != ARPHRD_IPGRE &&
 241                    t->dev->type != dev_type)
 242                        continue;
 243
 244                score = 0;
 245                if (t->parms.link != link)
 246                        score |= 1;
 247                if (t->dev->type != dev_type)
 248                        score |= 2;
 249                if (score == 0)
 250                        return t;
 251
 252                if (score < cand_score) {
 253                        cand = t;
 254                        cand_score = score;
 255                }
 256        }
 257
 258        for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
 259                if (t->parms.i_key != key ||
 260                    !(t->dev->flags & IFF_UP))
 261                        continue;
 262
 263                if (t->dev->type != ARPHRD_IPGRE &&
 264                    t->dev->type != dev_type)
 265                        continue;
 266
 267                score = 0;
 268                if (t->parms.link != link)
 269                        score |= 1;
 270                if (t->dev->type != dev_type)
 271                        score |= 2;
 272                if (score == 0)
 273                        return t;
 274
 275                if (score < cand_score) {
 276                        cand = t;
 277                        cand_score = score;
 278                }
 279        }
 280
 281        if (cand != NULL)
 282                return cand;
 283
 284        dev = ign->fb_tunnel_dev;
 285        if (dev->flags & IFF_UP)
 286                return netdev_priv(dev);
 287
 288        return NULL;
 289}
 290
 291static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
 292                struct ip_tunnel_parm *parms)
 293{
 294        __be32 remote = parms->iph.daddr;
 295        __be32 local = parms->iph.saddr;
 296        __be32 key = parms->i_key;
 297        unsigned h = HASH(key);
 298        int prio = 0;
 299
 300        if (local)
 301                prio |= 1;
 302        if (remote && !ipv4_is_multicast(remote)) {
 303                prio |= 2;
 304                h ^= HASH(remote);
 305        }
 306
 307        return &ign->tunnels[prio][h];
 308}
 309
 310static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
 311                struct ip_tunnel *t)
 312{
 313        return __ipgre_bucket(ign, &t->parms);
 314}
 315
 316static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
 317{
 318        struct ip_tunnel **tp = ipgre_bucket(ign, t);
 319
 320        spin_lock_bh(&ipgre_lock);
 321        t->next = *tp;
 322        rcu_assign_pointer(*tp, t);
 323        spin_unlock_bh(&ipgre_lock);
 324}
 325
 326static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
 327{
 328        struct ip_tunnel **tp;
 329
 330        for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
 331                if (t == *tp) {
 332                        spin_lock_bh(&ipgre_lock);
 333                        *tp = t->next;
 334                        spin_unlock_bh(&ipgre_lock);
 335                        break;
 336                }
 337        }
 338}
 339
 340static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 341                                           struct ip_tunnel_parm *parms,
 342                                           int type)
 343{
 344        __be32 remote = parms->iph.daddr;
 345        __be32 local = parms->iph.saddr;
 346        __be32 key = parms->i_key;
 347        int link = parms->link;
 348        struct ip_tunnel *t, **tp;
 349        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 350
 351        for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
 352                if (local == t->parms.iph.saddr &&
 353                    remote == t->parms.iph.daddr &&
 354                    key == t->parms.i_key &&
 355                    link == t->parms.link &&
 356                    type == t->dev->type)
 357                        break;
 358
 359        return t;
 360}
 361
 362static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
 363                struct ip_tunnel_parm *parms, int create)
 364{
 365        struct ip_tunnel *t, *nt;
 366        struct net_device *dev;
 367        char name[IFNAMSIZ];
 368        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 369
 370        t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
 371        if (t || !create)
 372                return t;
 373
 374        if (parms->name[0])
 375                strlcpy(name, parms->name, IFNAMSIZ);
 376        else
 377                sprintf(name, "gre%%d");
 378
 379        dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
 380        if (!dev)
 381          return NULL;
 382
 383        dev_net_set(dev, net);
 384
 385        if (strchr(name, '%')) {
 386                if (dev_alloc_name(dev, name) < 0)
 387                        goto failed_free;
 388        }
 389
 390        nt = netdev_priv(dev);
 391        nt->parms = *parms;
 392        dev->rtnl_link_ops = &ipgre_link_ops;
 393
 394        dev->mtu = ipgre_tunnel_bind_dev(dev);
 395
 396        if (register_netdevice(dev) < 0)
 397                goto failed_free;
 398
 399        dev_hold(dev);
 400        ipgre_tunnel_link(ign, nt);
 401        return nt;
 402
 403failed_free:
 404        free_netdev(dev);
 405        return NULL;
 406}
 407
 408static void ipgre_tunnel_uninit(struct net_device *dev)
 409{
 410        struct net *net = dev_net(dev);
 411        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 412
 413        ipgre_tunnel_unlink(ign, netdev_priv(dev));
 414        dev_put(dev);
 415}
 416
 417
 418static void ipgre_err(struct sk_buff *skb, u32 info)
 419{
 420
 421/* All the routers (except for Linux) return only
 422   8 bytes of packet payload. It means, that precise relaying of
 423   ICMP in the real Internet is absolutely infeasible.
 424
 425   Moreover, Cisco "wise men" put GRE key to the third word
 426   in GRE header. It makes impossible maintaining even soft state for keyed
 427   GRE tunnels with enabled checksum. Tell them "thank you".
 428
 429   Well, I wonder, rfc1812 was written by Cisco employee,
 430   what the hell these idiots break standrads established
 431   by themself???
 432 */
 433
 434        struct iphdr *iph = (struct iphdr *)skb->data;
 435        __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
 436        int grehlen = (iph->ihl<<2) + 4;
 437        const int type = icmp_hdr(skb)->type;
 438        const int code = icmp_hdr(skb)->code;
 439        struct ip_tunnel *t;
 440        __be16 flags;
 441
 442        flags = p[0];
 443        if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
 444                if (flags&(GRE_VERSION|GRE_ROUTING))
 445                        return;
 446                if (flags&GRE_KEY) {
 447                        grehlen += 4;
 448                        if (flags&GRE_CSUM)
 449                                grehlen += 4;
 450                }
 451        }
 452
 453        /* If only 8 bytes returned, keyed message will be dropped here */
 454        if (skb_headlen(skb) < grehlen)
 455                return;
 456
 457        switch (type) {
 458        default:
 459        case ICMP_PARAMETERPROB:
 460                return;
 461
 462        case ICMP_DEST_UNREACH:
 463                switch (code) {
 464                case ICMP_SR_FAILED:
 465                case ICMP_PORT_UNREACH:
 466                        /* Impossible event. */
 467                        return;
 468                case ICMP_FRAG_NEEDED:
 469                        /* Soft state for pmtu is maintained by IP core. */
 470                        return;
 471                default:
 472                        /* All others are translated to HOST_UNREACH.
 473                           rfc2003 contains "deep thoughts" about NET_UNREACH,
 474                           I believe they are just ether pollution. --ANK
 475                         */
 476                        break;
 477                }
 478                break;
 479        case ICMP_TIME_EXCEEDED:
 480                if (code != ICMP_EXC_TTL)
 481                        return;
 482                break;
 483        }
 484
 485        rcu_read_lock();
 486        t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
 487                                flags & GRE_KEY ?
 488                                *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
 489                                p[1]);
 490        if (t == NULL || t->parms.iph.daddr == 0 ||
 491            ipv4_is_multicast(t->parms.iph.daddr))
 492                goto out;
 493
 494        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
 495                goto out;
 496
 497        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 498                t->err_count++;
 499        else
 500                t->err_count = 1;
 501        t->err_time = jiffies;
 502out:
 503        rcu_read_unlock();
 504        return;
 505}
 506
 507static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
 508{
 509        if (INET_ECN_is_ce(iph->tos)) {
 510                if (skb->protocol == htons(ETH_P_IP)) {
 511                        IP_ECN_set_ce(ip_hdr(skb));
 512                } else if (skb->protocol == htons(ETH_P_IPV6)) {
 513                        IP6_ECN_set_ce(ipv6_hdr(skb));
 514                }
 515        }
 516}
 517
 518static inline u8
 519ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
 520{
 521        u8 inner = 0;
 522        if (skb->protocol == htons(ETH_P_IP))
 523                inner = old_iph->tos;
 524        else if (skb->protocol == htons(ETH_P_IPV6))
 525                inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
 526        return INET_ECN_encapsulate(tos, inner);
 527}
 528
 529static int ipgre_rcv(struct sk_buff *skb)
 530{
 531        struct iphdr *iph;
 532        u8     *h;
 533        __be16    flags;
 534        __sum16   csum = 0;
 535        __be32 key = 0;
 536        u32    seqno = 0;
 537        struct ip_tunnel *tunnel;
 538        int    offset = 4;
 539        __be16 gre_proto;
 540        unsigned int len;
 541
 542        if (!pskb_may_pull(skb, 16))
 543                goto drop_nolock;
 544
 545        iph = ip_hdr(skb);
 546        h = skb->data;
 547        flags = *(__be16*)h;
 548
 549        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
 550                /* - Version must be 0.
 551                   - We do not support routing headers.
 552                 */
 553                if (flags&(GRE_VERSION|GRE_ROUTING))
 554                        goto drop_nolock;
 555
 556                if (flags&GRE_CSUM) {
 557                        switch (skb->ip_summed) {
 558                        case CHECKSUM_COMPLETE:
 559                                csum = csum_fold(skb->csum);
 560                                if (!csum)
 561                                        break;
 562                                /* fall through */
 563                        case CHECKSUM_NONE:
 564                                skb->csum = 0;
 565                                csum = __skb_checksum_complete(skb);
 566                                skb->ip_summed = CHECKSUM_COMPLETE;
 567                        }
 568                        offset += 4;
 569                }
 570                if (flags&GRE_KEY) {
 571                        key = *(__be32*)(h + offset);
 572                        offset += 4;
 573                }
 574                if (flags&GRE_SEQ) {
 575                        seqno = ntohl(*(__be32*)(h + offset));
 576                        offset += 4;
 577                }
 578        }
 579
 580        gre_proto = *(__be16 *)(h + 2);
 581
 582        rcu_read_lock();
 583        if ((tunnel = ipgre_tunnel_lookup(skb->dev,
 584                                          iph->saddr, iph->daddr, key,
 585                                          gre_proto))) {
 586                struct net_device_stats *stats = &tunnel->dev->stats;
 587
 588                secpath_reset(skb);
 589
 590                skb->protocol = gre_proto;
 591                /* WCCP version 1 and 2 protocol decoding.
 592                 * - Change protocol to IP
 593                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
 594                 */
 595                if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
 596                        skb->protocol = htons(ETH_P_IP);
 597                        if ((*(h + offset) & 0xF0) != 0x40)
 598                                offset += 4;
 599                }
 600
 601                skb->mac_header = skb->network_header;
 602                __pskb_pull(skb, offset);
 603                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
 604                skb->pkt_type = PACKET_HOST;
 605#ifdef CONFIG_NET_IPGRE_BROADCAST
 606                if (ipv4_is_multicast(iph->daddr)) {
 607                        /* Looped back packet, drop it! */
 608                        if (skb_rtable(skb)->fl.iif == 0)
 609                                goto drop;
 610                        stats->multicast++;
 611                        skb->pkt_type = PACKET_BROADCAST;
 612                }
 613#endif
 614
 615                if (((flags&GRE_CSUM) && csum) ||
 616                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
 617                        stats->rx_crc_errors++;
 618                        stats->rx_errors++;
 619                        goto drop;
 620                }
 621                if (tunnel->parms.i_flags&GRE_SEQ) {
 622                        if (!(flags&GRE_SEQ) ||
 623                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
 624                                stats->rx_fifo_errors++;
 625                                stats->rx_errors++;
 626                                goto drop;
 627                        }
 628                        tunnel->i_seqno = seqno + 1;
 629                }
 630
 631                len = skb->len;
 632
 633                /* Warning: All skb pointers will be invalidated! */
 634                if (tunnel->dev->type == ARPHRD_ETHER) {
 635                        if (!pskb_may_pull(skb, ETH_HLEN)) {
 636                                stats->rx_length_errors++;
 637                                stats->rx_errors++;
 638                                goto drop;
 639                        }
 640
 641                        iph = ip_hdr(skb);
 642                        skb->protocol = eth_type_trans(skb, tunnel->dev);
 643                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 644                }
 645
 646                stats->rx_packets++;
 647                stats->rx_bytes += len;
 648                skb->dev = tunnel->dev;
 649                skb_dst_drop(skb);
 650                nf_reset(skb);
 651
 652                skb_reset_network_header(skb);
 653                ipgre_ecn_decapsulate(iph, skb);
 654
 655                netif_rx(skb);
 656                rcu_read_unlock();
 657                return(0);
 658        }
 659        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 660
 661drop:
 662        rcu_read_unlock();
 663drop_nolock:
 664        kfree_skb(skb);
 665        return(0);
 666}
 667
 668static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 669{
 670        struct ip_tunnel *tunnel = netdev_priv(dev);
 671        struct net_device_stats *stats = &dev->stats;
 672        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
 673        struct iphdr  *old_iph = ip_hdr(skb);
 674        struct iphdr  *tiph;
 675        u8     tos;
 676        __be16 df;
 677        struct rtable *rt;                      /* Route to the other host */
 678        struct net_device *tdev;                        /* Device to other host */
 679        struct iphdr  *iph;                     /* Our new IP header */
 680        unsigned int max_headroom;              /* The extra header space needed */
 681        int    gre_hlen;
 682        __be32 dst;
 683        int    mtu;
 684
 685        if (dev->type == ARPHRD_ETHER)
 686                IPCB(skb)->flags = 0;
 687
 688        if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
 689                gre_hlen = 0;
 690                tiph = (struct iphdr *)skb->data;
 691        } else {
 692                gre_hlen = tunnel->hlen;
 693                tiph = &tunnel->parms.iph;
 694        }
 695
 696        if ((dst = tiph->daddr) == 0) {
 697                /* NBMA tunnel */
 698
 699                if (skb_dst(skb) == NULL) {
 700                        stats->tx_fifo_errors++;
 701                        goto tx_error;
 702                }
 703
 704                if (skb->protocol == htons(ETH_P_IP)) {
 705                        rt = skb_rtable(skb);
 706                        if ((dst = rt->rt_gateway) == 0)
 707                                goto tx_error_icmp;
 708                }
 709#ifdef CONFIG_IPV6
 710                else if (skb->protocol == htons(ETH_P_IPV6)) {
 711                        struct in6_addr *addr6;
 712                        int addr_type;
 713                        struct neighbour *neigh = skb_dst(skb)->neighbour;
 714
 715                        if (neigh == NULL)
 716                                goto tx_error;
 717
 718                        addr6 = (struct in6_addr *)&neigh->primary_key;
 719                        addr_type = ipv6_addr_type(addr6);
 720
 721                        if (addr_type == IPV6_ADDR_ANY) {
 722                                addr6 = &ipv6_hdr(skb)->daddr;
 723                                addr_type = ipv6_addr_type(addr6);
 724                        }
 725
 726                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 727                                goto tx_error_icmp;
 728
 729                        dst = addr6->s6_addr32[3];
 730                }
 731#endif
 732                else
 733                        goto tx_error;
 734        }
 735
 736        tos = tiph->tos;
 737        if (tos == 1) {
 738                tos = 0;
 739                if (skb->protocol == htons(ETH_P_IP))
 740                        tos = old_iph->tos;
 741        }
 742
 743        {
 744                struct flowi fl = { .oif = tunnel->parms.link,
 745                                    .nl_u = { .ip4_u =
 746                                              { .daddr = dst,
 747                                                .saddr = tiph->saddr,
 748                                                .tos = RT_TOS(tos) } },
 749                                    .proto = IPPROTO_GRE };
 750                if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
 751                        stats->tx_carrier_errors++;
 752                        goto tx_error;
 753                }
 754        }
 755        tdev = rt->u.dst.dev;
 756
 757        if (tdev == dev) {
 758                ip_rt_put(rt);
 759                stats->collisions++;
 760                goto tx_error;
 761        }
 762
 763        df = tiph->frag_off;
 764        if (df)
 765                mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
 766        else
 767                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 768
 769        if (skb_dst(skb))
 770                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 771
 772        if (skb->protocol == htons(ETH_P_IP)) {
 773                df |= (old_iph->frag_off&htons(IP_DF));
 774
 775                if ((old_iph->frag_off&htons(IP_DF)) &&
 776                    mtu < ntohs(old_iph->tot_len)) {
 777                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 778                        ip_rt_put(rt);
 779                        goto tx_error;
 780                }
 781        }
 782#ifdef CONFIG_IPV6
 783        else if (skb->protocol == htons(ETH_P_IPV6)) {
 784                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 785
 786                if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
 787                        if ((tunnel->parms.iph.daddr &&
 788                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 789                            rt6->rt6i_dst.plen == 128) {
 790                                rt6->rt6i_flags |= RTF_MODIFIED;
 791                                skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
 792                        }
 793                }
 794
 795                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
 796                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
 797                        ip_rt_put(rt);
 798                        goto tx_error;
 799                }
 800        }
 801#endif
 802
 803        if (tunnel->err_count > 0) {
 804                if (time_before(jiffies,
 805                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 806                        tunnel->err_count--;
 807
 808                        dst_link_failure(skb);
 809                } else
 810                        tunnel->err_count = 0;
 811        }
 812
 813        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
 814
 815        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
 816            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
 817                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 818                if (!new_skb) {
 819                        ip_rt_put(rt);
 820                        txq->tx_dropped++;
 821                        dev_kfree_skb(skb);
 822                        return NETDEV_TX_OK;
 823                }
 824                if (skb->sk)
 825                        skb_set_owner_w(new_skb, skb->sk);
 826                dev_kfree_skb(skb);
 827                skb = new_skb;
 828                old_iph = ip_hdr(skb);
 829        }
 830
 831        skb_reset_transport_header(skb);
 832        skb_push(skb, gre_hlen);
 833        skb_reset_network_header(skb);
 834        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 835        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 836                              IPSKB_REROUTED);
 837        skb_dst_drop(skb);
 838        skb_dst_set(skb, &rt->u.dst);
 839
 840        /*
 841         *      Push down and install the IPIP header.
 842         */
 843
 844        iph                     =       ip_hdr(skb);
 845        iph->version            =       4;
 846        iph->ihl                =       sizeof(struct iphdr) >> 2;
 847        iph->frag_off           =       df;
 848        iph->protocol           =       IPPROTO_GRE;
 849        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
 850        iph->daddr              =       rt->rt_dst;
 851        iph->saddr              =       rt->rt_src;
 852
 853        if ((iph->ttl = tiph->ttl) == 0) {
 854                if (skb->protocol == htons(ETH_P_IP))
 855                        iph->ttl = old_iph->ttl;
 856#ifdef CONFIG_IPV6
 857                else if (skb->protocol == htons(ETH_P_IPV6))
 858                        iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
 859#endif
 860                else
 861                        iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
 862        }
 863
 864        ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
 865        ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
 866                                   htons(ETH_P_TEB) : skb->protocol;
 867
 868        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
 869                __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
 870
 871                if (tunnel->parms.o_flags&GRE_SEQ) {
 872                        ++tunnel->o_seqno;
 873                        *ptr = htonl(tunnel->o_seqno);
 874                        ptr--;
 875                }
 876                if (tunnel->parms.o_flags&GRE_KEY) {
 877                        *ptr = tunnel->parms.o_key;
 878                        ptr--;
 879                }
 880                if (tunnel->parms.o_flags&GRE_CSUM) {
 881                        *ptr = 0;
 882                        *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
 883                }
 884        }
 885
 886        nf_reset(skb);
 887
 888        IPTUNNEL_XMIT();
 889        return NETDEV_TX_OK;
 890
 891tx_error_icmp:
 892        dst_link_failure(skb);
 893
 894tx_error:
 895        stats->tx_errors++;
 896        dev_kfree_skb(skb);
 897        return NETDEV_TX_OK;
 898}
 899
 900static int ipgre_tunnel_bind_dev(struct net_device *dev)
 901{
 902        struct net_device *tdev = NULL;
 903        struct ip_tunnel *tunnel;
 904        struct iphdr *iph;
 905        int hlen = LL_MAX_HEADER;
 906        int mtu = ETH_DATA_LEN;
 907        int addend = sizeof(struct iphdr) + 4;
 908
 909        tunnel = netdev_priv(dev);
 910        iph = &tunnel->parms.iph;
 911
 912        /* Guess output device to choose reasonable mtu and needed_headroom */
 913
 914        if (iph->daddr) {
 915                struct flowi fl = { .oif = tunnel->parms.link,
 916                                    .nl_u = { .ip4_u =
 917                                              { .daddr = iph->daddr,
 918                                                .saddr = iph->saddr,
 919                                                .tos = RT_TOS(iph->tos) } },
 920                                    .proto = IPPROTO_GRE };
 921                struct rtable *rt;
 922                if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
 923                        tdev = rt->u.dst.dev;
 924                        ip_rt_put(rt);
 925                }
 926
 927                if (dev->type != ARPHRD_ETHER)
 928                        dev->flags |= IFF_POINTOPOINT;
 929        }
 930
 931        if (!tdev && tunnel->parms.link)
 932                tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
 933
 934        if (tdev) {
 935                hlen = tdev->hard_header_len + tdev->needed_headroom;
 936                mtu = tdev->mtu;
 937        }
 938        dev->iflink = tunnel->parms.link;
 939
 940        /* Precalculate GRE options length */
 941        if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
 942                if (tunnel->parms.o_flags&GRE_CSUM)
 943                        addend += 4;
 944                if (tunnel->parms.o_flags&GRE_KEY)
 945                        addend += 4;
 946                if (tunnel->parms.o_flags&GRE_SEQ)
 947                        addend += 4;
 948        }
 949        dev->needed_headroom = addend + hlen;
 950        mtu -= dev->hard_header_len + addend;
 951
 952        if (mtu < 68)
 953                mtu = 68;
 954
 955        tunnel->hlen = addend;
 956
 957        return mtu;
 958}
 959
 960static int
 961ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 962{
 963        int err = 0;
 964        struct ip_tunnel_parm p;
 965        struct ip_tunnel *t;
 966        struct net *net = dev_net(dev);
 967        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 968
 969        switch (cmd) {
 970        case SIOCGETTUNNEL:
 971                t = NULL;
 972                if (dev == ign->fb_tunnel_dev) {
 973                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
 974                                err = -EFAULT;
 975                                break;
 976                        }
 977                        t = ipgre_tunnel_locate(net, &p, 0);
 978                }
 979                if (t == NULL)
 980                        t = netdev_priv(dev);
 981                memcpy(&p, &t->parms, sizeof(p));
 982                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 983                        err = -EFAULT;
 984                break;
 985
 986        case SIOCADDTUNNEL:
 987        case SIOCCHGTUNNEL:
 988                err = -EPERM;
 989                if (!capable(CAP_NET_ADMIN))
 990                        goto done;
 991
 992                err = -EFAULT;
 993                if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
 994                        goto done;
 995
 996                err = -EINVAL;
 997                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
 998                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
 999                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1000                        goto done;
1001                if (p.iph.ttl)
1002                        p.iph.frag_off |= htons(IP_DF);
1003
1004                if (!(p.i_flags&GRE_KEY))
1005                        p.i_key = 0;
1006                if (!(p.o_flags&GRE_KEY))
1007                        p.o_key = 0;
1008
1009                t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1010
1011                if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1012                        if (t != NULL) {
1013                                if (t->dev != dev) {
1014                                        err = -EEXIST;
1015                                        break;
1016                                }
1017                        } else {
1018                                unsigned nflags = 0;
1019
1020                                t = netdev_priv(dev);
1021
1022                                if (ipv4_is_multicast(p.iph.daddr))
1023                                        nflags = IFF_BROADCAST;
1024                                else if (p.iph.daddr)
1025                                        nflags = IFF_POINTOPOINT;
1026
1027                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1028                                        err = -EINVAL;
1029                                        break;
1030                                }
1031                                ipgre_tunnel_unlink(ign, t);
1032                                t->parms.iph.saddr = p.iph.saddr;
1033                                t->parms.iph.daddr = p.iph.daddr;
1034                                t->parms.i_key = p.i_key;
1035                                t->parms.o_key = p.o_key;
1036                                memcpy(dev->dev_addr, &p.iph.saddr, 4);
1037                                memcpy(dev->broadcast, &p.iph.daddr, 4);
1038                                ipgre_tunnel_link(ign, t);
1039                                netdev_state_change(dev);
1040                        }
1041                }
1042
1043                if (t) {
1044                        err = 0;
1045                        if (cmd == SIOCCHGTUNNEL) {
1046                                t->parms.iph.ttl = p.iph.ttl;
1047                                t->parms.iph.tos = p.iph.tos;
1048                                t->parms.iph.frag_off = p.iph.frag_off;
1049                                if (t->parms.link != p.link) {
1050                                        t->parms.link = p.link;
1051                                        dev->mtu = ipgre_tunnel_bind_dev(dev);
1052                                        netdev_state_change(dev);
1053                                }
1054                        }
1055                        if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1056                                err = -EFAULT;
1057                } else
1058                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1059                break;
1060
1061        case SIOCDELTUNNEL:
1062                err = -EPERM;
1063                if (!capable(CAP_NET_ADMIN))
1064                        goto done;
1065
1066                if (dev == ign->fb_tunnel_dev) {
1067                        err = -EFAULT;
1068                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1069                                goto done;
1070                        err = -ENOENT;
1071                        if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1072                                goto done;
1073                        err = -EPERM;
1074                        if (t == netdev_priv(ign->fb_tunnel_dev))
1075                                goto done;
1076                        dev = t->dev;
1077                }
1078                unregister_netdevice(dev);
1079                err = 0;
1080                break;
1081
1082        default:
1083                err = -EINVAL;
1084        }
1085
1086done:
1087        return err;
1088}
1089
1090static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1091{
1092        struct ip_tunnel *tunnel = netdev_priv(dev);
1093        if (new_mtu < 68 ||
1094            new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1095                return -EINVAL;
1096        dev->mtu = new_mtu;
1097        return 0;
1098}
1099
/* Nice toy. Unfortunately, useless in real life :-)
   It allows constructing a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1109
1110   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1111   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1112
1113   ping -t 255 224.66.66.66
1114
1115   If nobody answers, mbone does not work.
1116
1117   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1118   ip addr add 10.66.66.<somewhat>/24 dev Universe
1119   ifconfig Universe up
1120   ifconfig Universe add fe80::<Your_real_addr>/10
1121   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1122   ftp 10.66.66.66
1123   ...
1124   ftp fec0:6666:6666::193.233.7.65
1125   ...
1126
1127 */
1128
1129static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1130                        unsigned short type,
1131                        const void *daddr, const void *saddr, unsigned len)
1132{
1133        struct ip_tunnel *t = netdev_priv(dev);
1134        struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1135        __be16 *p = (__be16*)(iph+1);
1136
1137        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1138        p[0]            = t->parms.o_flags;
1139        p[1]            = htons(type);
1140
1141        /*
1142         *      Set the source hardware address.
1143         */
1144
1145        if (saddr)
1146                memcpy(&iph->saddr, saddr, 4);
1147
1148        if (daddr) {
1149                memcpy(&iph->daddr, daddr, 4);
1150                return t->hlen;
1151        }
1152        if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1153                return t->hlen;
1154
1155        return -t->hlen;
1156}
1157
1158static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1159{
1160        struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1161        memcpy(haddr, &iph->saddr, 4);
1162        return 4;
1163}
1164
1165static const struct header_ops ipgre_header_ops = {
1166        .create = ipgre_header,
1167        .parse  = ipgre_header_parse,
1168};
1169
1170#ifdef CONFIG_NET_IPGRE_BROADCAST
1171static int ipgre_open(struct net_device *dev)
1172{
1173        struct ip_tunnel *t = netdev_priv(dev);
1174
1175        if (ipv4_is_multicast(t->parms.iph.daddr)) {
1176                struct flowi fl = { .oif = t->parms.link,
1177                                    .nl_u = { .ip4_u =
1178                                              { .daddr = t->parms.iph.daddr,
1179                                                .saddr = t->parms.iph.saddr,
1180                                                .tos = RT_TOS(t->parms.iph.tos) } },
1181                                    .proto = IPPROTO_GRE };
1182                struct rtable *rt;
1183                if (ip_route_output_key(dev_net(dev), &rt, &fl))
1184                        return -EADDRNOTAVAIL;
1185                dev = rt->u.dst.dev;
1186                ip_rt_put(rt);
1187                if (__in_dev_get_rtnl(dev) == NULL)
1188                        return -EADDRNOTAVAIL;
1189                t->mlink = dev->ifindex;
1190                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1191        }
1192        return 0;
1193}
1194
1195static int ipgre_close(struct net_device *dev)
1196{
1197        struct ip_tunnel *t = netdev_priv(dev);
1198
1199        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1200                struct in_device *in_dev;
1201                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1202                if (in_dev) {
1203                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1204                        in_dev_put(in_dev);
1205                }
1206        }
1207        return 0;
1208}
1209
1210#endif
1211
1212static const struct net_device_ops ipgre_netdev_ops = {
1213        .ndo_init               = ipgre_tunnel_init,
1214        .ndo_uninit             = ipgre_tunnel_uninit,
1215#ifdef CONFIG_NET_IPGRE_BROADCAST
1216        .ndo_open               = ipgre_open,
1217        .ndo_stop               = ipgre_close,
1218#endif
1219        .ndo_start_xmit         = ipgre_tunnel_xmit,
1220        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1221        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1222};
1223
1224static void ipgre_tunnel_setup(struct net_device *dev)
1225{
1226        dev->netdev_ops         = &ipgre_netdev_ops;
1227        dev->destructor         = free_netdev;
1228
1229        dev->type               = ARPHRD_IPGRE;
1230        dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1231        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1232        dev->flags              = IFF_NOARP;
1233        dev->iflink             = 0;
1234        dev->addr_len           = 4;
1235        dev->features           |= NETIF_F_NETNS_LOCAL;
1236        dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1237}
1238
1239static int ipgre_tunnel_init(struct net_device *dev)
1240{
1241        struct ip_tunnel *tunnel;
1242        struct iphdr *iph;
1243
1244        tunnel = netdev_priv(dev);
1245        iph = &tunnel->parms.iph;
1246
1247        tunnel->dev = dev;
1248        strcpy(tunnel->parms.name, dev->name);
1249
1250        memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1251        memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1252
1253        if (iph->daddr) {
1254#ifdef CONFIG_NET_IPGRE_BROADCAST
1255                if (ipv4_is_multicast(iph->daddr)) {
1256                        if (!iph->saddr)
1257                                return -EINVAL;
1258                        dev->flags = IFF_BROADCAST;
1259                        dev->header_ops = &ipgre_header_ops;
1260                }
1261#endif
1262        } else
1263                dev->header_ops = &ipgre_header_ops;
1264
1265        return 0;
1266}
1267
1268static void ipgre_fb_tunnel_init(struct net_device *dev)
1269{
1270        struct ip_tunnel *tunnel = netdev_priv(dev);
1271        struct iphdr *iph = &tunnel->parms.iph;
1272        struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1273
1274        tunnel->dev = dev;
1275        strcpy(tunnel->parms.name, dev->name);
1276
1277        iph->version            = 4;
1278        iph->protocol           = IPPROTO_GRE;
1279        iph->ihl                = 5;
1280        tunnel->hlen            = sizeof(struct iphdr) + 4;
1281
1282        dev_hold(dev);
1283        ign->tunnels_wc[0]      = tunnel;
1284}
1285
1286
1287static const struct net_protocol ipgre_protocol = {
1288        .handler        =       ipgre_rcv,
1289        .err_handler    =       ipgre_err,
1290        .netns_ok       =       1,
1291};
1292
1293static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1294{
1295        int prio;
1296
1297        for (prio = 0; prio < 4; prio++) {
1298                int h;
1299                for (h = 0; h < HASH_SIZE; h++) {
1300                        struct ip_tunnel *t = ign->tunnels[prio][h];
1301
1302                        while (t != NULL) {
1303                                unregister_netdevice_queue(t->dev, head);
1304                                t = t->next;
1305                        }
1306                }
1307        }
1308}
1309
1310static int ipgre_init_net(struct net *net)
1311{
1312        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1313        int err;
1314
1315        ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1316                                           ipgre_tunnel_setup);
1317        if (!ign->fb_tunnel_dev) {
1318                err = -ENOMEM;
1319                goto err_alloc_dev;
1320        }
1321        dev_net_set(ign->fb_tunnel_dev, net);
1322
1323        ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1324        ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1325
1326        if ((err = register_netdev(ign->fb_tunnel_dev)))
1327                goto err_reg_dev;
1328
1329        return 0;
1330
1331err_reg_dev:
1332        free_netdev(ign->fb_tunnel_dev);
1333err_alloc_dev:
1334        return err;
1335}
1336
1337static void ipgre_exit_net(struct net *net)
1338{
1339        struct ipgre_net *ign;
1340        LIST_HEAD(list);
1341
1342        ign = net_generic(net, ipgre_net_id);
1343        rtnl_lock();
1344        ipgre_destroy_tunnels(ign, &list);
1345        unregister_netdevice_many(&list);
1346        rtnl_unlock();
1347}
1348
1349static struct pernet_operations ipgre_net_ops = {
1350        .init = ipgre_init_net,
1351        .exit = ipgre_exit_net,
1352        .id   = &ipgre_net_id,
1353        .size = sizeof(struct ipgre_net),
1354};
1355
1356static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1357{
1358        __be16 flags;
1359
1360        if (!data)
1361                return 0;
1362
1363        flags = 0;
1364        if (data[IFLA_GRE_IFLAGS])
1365                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1366        if (data[IFLA_GRE_OFLAGS])
1367                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1368        if (flags & (GRE_VERSION|GRE_ROUTING))
1369                return -EINVAL;
1370
1371        return 0;
1372}
1373
1374static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1375{
1376        __be32 daddr;
1377
1378        if (tb[IFLA_ADDRESS]) {
1379                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1380                        return -EINVAL;
1381                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1382                        return -EADDRNOTAVAIL;
1383        }
1384
1385        if (!data)
1386                goto out;
1387
1388        if (data[IFLA_GRE_REMOTE]) {
1389                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1390                if (!daddr)
1391                        return -EINVAL;
1392        }
1393
1394out:
1395        return ipgre_tunnel_validate(tb, data);
1396}
1397
1398static void ipgre_netlink_parms(struct nlattr *data[],
1399                                struct ip_tunnel_parm *parms)
1400{
1401        memset(parms, 0, sizeof(*parms));
1402
1403        parms->iph.protocol = IPPROTO_GRE;
1404
1405        if (!data)
1406                return;
1407
1408        if (data[IFLA_GRE_LINK])
1409                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1410
1411        if (data[IFLA_GRE_IFLAGS])
1412                parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1413
1414        if (data[IFLA_GRE_OFLAGS])
1415                parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1416
1417        if (data[IFLA_GRE_IKEY])
1418                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1419
1420        if (data[IFLA_GRE_OKEY])
1421                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1422
1423        if (data[IFLA_GRE_LOCAL])
1424                parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1425
1426        if (data[IFLA_GRE_REMOTE])
1427                parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1428
1429        if (data[IFLA_GRE_TTL])
1430                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1431
1432        if (data[IFLA_GRE_TOS])
1433                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1434
1435        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1436                parms->iph.frag_off = htons(IP_DF);
1437}
1438
1439static int ipgre_tap_init(struct net_device *dev)
1440{
1441        struct ip_tunnel *tunnel;
1442
1443        tunnel = netdev_priv(dev);
1444
1445        tunnel->dev = dev;
1446        strcpy(tunnel->parms.name, dev->name);
1447
1448        ipgre_tunnel_bind_dev(dev);
1449
1450        return 0;
1451}
1452
1453static const struct net_device_ops ipgre_tap_netdev_ops = {
1454        .ndo_init               = ipgre_tap_init,
1455        .ndo_uninit             = ipgre_tunnel_uninit,
1456        .ndo_start_xmit         = ipgre_tunnel_xmit,
1457        .ndo_set_mac_address    = eth_mac_addr,
1458        .ndo_validate_addr      = eth_validate_addr,
1459        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1460};
1461
1462static void ipgre_tap_setup(struct net_device *dev)
1463{
1464
1465        ether_setup(dev);
1466
1467        dev->netdev_ops         = &ipgre_tap_netdev_ops;
1468        dev->destructor         = free_netdev;
1469
1470        dev->iflink             = 0;
1471        dev->features           |= NETIF_F_NETNS_LOCAL;
1472}
1473
1474static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1475                         struct nlattr *data[])
1476{
1477        struct ip_tunnel *nt;
1478        struct net *net = dev_net(dev);
1479        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1480        int mtu;
1481        int err;
1482
1483        nt = netdev_priv(dev);
1484        ipgre_netlink_parms(data, &nt->parms);
1485
1486        if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1487                return -EEXIST;
1488
1489        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1490                random_ether_addr(dev->dev_addr);
1491
1492        mtu = ipgre_tunnel_bind_dev(dev);
1493        if (!tb[IFLA_MTU])
1494                dev->mtu = mtu;
1495
1496        err = register_netdevice(dev);
1497        if (err)
1498                goto out;
1499
1500        dev_hold(dev);
1501        ipgre_tunnel_link(ign, nt);
1502
1503out:
1504        return err;
1505}
1506
1507static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1508                            struct nlattr *data[])
1509{
1510        struct ip_tunnel *t, *nt;
1511        struct net *net = dev_net(dev);
1512        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1513        struct ip_tunnel_parm p;
1514        int mtu;
1515
1516        if (dev == ign->fb_tunnel_dev)
1517                return -EINVAL;
1518
1519        nt = netdev_priv(dev);
1520        ipgre_netlink_parms(data, &p);
1521
1522        t = ipgre_tunnel_locate(net, &p, 0);
1523
1524        if (t) {
1525                if (t->dev != dev)
1526                        return -EEXIST;
1527        } else {
1528                t = nt;
1529
1530                if (dev->type != ARPHRD_ETHER) {
1531                        unsigned nflags = 0;
1532
1533                        if (ipv4_is_multicast(p.iph.daddr))
1534                                nflags = IFF_BROADCAST;
1535                        else if (p.iph.daddr)
1536                                nflags = IFF_POINTOPOINT;
1537
1538                        if ((dev->flags ^ nflags) &
1539                            (IFF_POINTOPOINT | IFF_BROADCAST))
1540                                return -EINVAL;
1541                }
1542
1543                ipgre_tunnel_unlink(ign, t);
1544                t->parms.iph.saddr = p.iph.saddr;
1545                t->parms.iph.daddr = p.iph.daddr;
1546                t->parms.i_key = p.i_key;
1547                if (dev->type != ARPHRD_ETHER) {
1548                        memcpy(dev->dev_addr, &p.iph.saddr, 4);
1549                        memcpy(dev->broadcast, &p.iph.daddr, 4);
1550                }
1551                ipgre_tunnel_link(ign, t);
1552                netdev_state_change(dev);
1553        }
1554
1555        t->parms.o_key = p.o_key;
1556        t->parms.iph.ttl = p.iph.ttl;
1557        t->parms.iph.tos = p.iph.tos;
1558        t->parms.iph.frag_off = p.iph.frag_off;
1559
1560        if (t->parms.link != p.link) {
1561                t->parms.link = p.link;
1562                mtu = ipgre_tunnel_bind_dev(dev);
1563                if (!tb[IFLA_MTU])
1564                        dev->mtu = mtu;
1565                netdev_state_change(dev);
1566        }
1567
1568        return 0;
1569}
1570
1571static size_t ipgre_get_size(const struct net_device *dev)
1572{
1573        return
1574                /* IFLA_GRE_LINK */
1575                nla_total_size(4) +
1576                /* IFLA_GRE_IFLAGS */
1577                nla_total_size(2) +
1578                /* IFLA_GRE_OFLAGS */
1579                nla_total_size(2) +
1580                /* IFLA_GRE_IKEY */
1581                nla_total_size(4) +
1582                /* IFLA_GRE_OKEY */
1583                nla_total_size(4) +
1584                /* IFLA_GRE_LOCAL */
1585                nla_total_size(4) +
1586                /* IFLA_GRE_REMOTE */
1587                nla_total_size(4) +
1588                /* IFLA_GRE_TTL */
1589                nla_total_size(1) +
1590                /* IFLA_GRE_TOS */
1591                nla_total_size(1) +
1592                /* IFLA_GRE_PMTUDISC */
1593                nla_total_size(1) +
1594                0;
1595}
1596
1597static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1598{
1599        struct ip_tunnel *t = netdev_priv(dev);
1600        struct ip_tunnel_parm *p = &t->parms;
1601
1602        NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1603        NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1604        NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1605        NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1606        NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1607        NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1608        NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1609        NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1610        NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1611        NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1612
1613        return 0;
1614
1615nla_put_failure:
1616        return -EMSGSIZE;
1617}
1618
1619static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1620        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1621        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1622        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1623        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1624        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1625        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1626        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1627        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1628        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1629        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1630};
1631
1632static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1633        .kind           = "gre",
1634        .maxtype        = IFLA_GRE_MAX,
1635        .policy         = ipgre_policy,
1636        .priv_size      = sizeof(struct ip_tunnel),
1637        .setup          = ipgre_tunnel_setup,
1638        .validate       = ipgre_tunnel_validate,
1639        .newlink        = ipgre_newlink,
1640        .changelink     = ipgre_changelink,
1641        .get_size       = ipgre_get_size,
1642        .fill_info      = ipgre_fill_info,
1643};
1644
1645static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1646        .kind           = "gretap",
1647        .maxtype        = IFLA_GRE_MAX,
1648        .policy         = ipgre_policy,
1649        .priv_size      = sizeof(struct ip_tunnel),
1650        .setup          = ipgre_tap_setup,
1651        .validate       = ipgre_tap_validate,
1652        .newlink        = ipgre_newlink,
1653        .changelink     = ipgre_changelink,
1654        .get_size       = ipgre_get_size,
1655        .fill_info      = ipgre_fill_info,
1656};
1657
1658/*
1659 *      And now the modules code and kernel interface.
1660 */
1661
1662static int __init ipgre_init(void)
1663{
1664        int err;
1665
1666        printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1667
1668        if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1669                printk(KERN_INFO "ipgre init: can't add protocol\n");
1670                return -EAGAIN;
1671        }
1672
1673        err = register_pernet_device(&ipgre_net_ops);
1674        if (err < 0)
1675                goto gen_device_failed;
1676
1677        err = rtnl_link_register(&ipgre_link_ops);
1678        if (err < 0)
1679                goto rtnl_link_failed;
1680
1681        err = rtnl_link_register(&ipgre_tap_ops);
1682        if (err < 0)
1683                goto tap_ops_failed;
1684
1685out:
1686        return err;
1687
1688tap_ops_failed:
1689        rtnl_link_unregister(&ipgre_link_ops);
1690rtnl_link_failed:
1691        unregister_pernet_device(&ipgre_net_ops);
1692gen_device_failed:
1693        inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1694        goto out;
1695}
1696
1697static void __exit ipgre_fini(void)
1698{
1699        rtnl_link_unregister(&ipgre_tap_ops);
1700        rtnl_link_unregister(&ipgre_link_ops);
1701        unregister_pernet_device(&ipgre_net_ops);
1702        if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1703                printk(KERN_INFO "ipgre close: can't remove protocol\n");
1704}
1705
1706module_init(ipgre_init);
1707module_exit(ipgre_fini);
1708MODULE_LICENSE("GPL");
1709MODULE_ALIAS_RTNL_LINK("gre");
1710MODULE_ALIAS_RTNL_LINK("gretap");
1711
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.