linux/net/ipv4/ip_gre.c
<<
>>
Prefs
   1/*
   2 *      Linux NET3:     GRE over IP protocol decoder.
   3 *
   4 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   5 *
   6 *      This program is free software; you can redistribute it and/or
   7 *      modify it under the terms of the GNU General Public License
   8 *      as published by the Free Software Foundation; either version
   9 *      2 of the License, or (at your option) any later version.
  10 *
  11 */
  12
  13#include <linux/capability.h>
  14#include <linux/module.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <asm/uaccess.h>
  18#include <linux/skbuff.h>
  19#include <linux/netdevice.h>
  20#include <linux/in.h>
  21#include <linux/tcp.h>
  22#include <linux/udp.h>
  23#include <linux/if_arp.h>
  24#include <linux/mroute.h>
  25#include <linux/init.h>
  26#include <linux/in6.h>
  27#include <linux/inetdevice.h>
  28#include <linux/igmp.h>
  29#include <linux/netfilter_ipv4.h>
  30#include <linux/etherdevice.h>
  31#include <linux/if_ether.h>
  32
  33#include <net/sock.h>
  34#include <net/ip.h>
  35#include <net/icmp.h>
  36#include <net/protocol.h>
  37#include <net/ipip.h>
  38#include <net/arp.h>
  39#include <net/checksum.h>
  40#include <net/dsfield.h>
  41#include <net/inet_ecn.h>
  42#include <net/xfrm.h>
  43#include <net/net_namespace.h>
  44#include <net/netns/generic.h>
  45#include <net/rtnetlink.h>
  46
  47#ifdef CONFIG_IPV6
  48#include <net/ipv6.h>
  49#include <net/ip6_fib.h>
  50#include <net/ip6_route.h>
  51#endif
  52
  53/*
  54   Problems & solutions
  55   --------------------
  56
  57   1. The most important issue is detecting local dead loops.
  58   They would cause complete host lockup in transmit, which
  59   would be "resolved" by stack overflow or, if queueing is enabled,
  60   with infinite looping in net_bh.
  61
  62   We cannot track such dead loops during route installation,
  63   it is infeasible task. The most general solutions would be
  64   to keep skb->encapsulation counter (sort of local ttl),
  65   and silently drop packet when it expires. It is the best
  66   solution, but it supposes maintaining a new variable in ALL
  67   skb, even if no tunneling is used.
  68
  69   Current solution: t->recursion lock breaks dead loops. It looks
  70   like dev->tbusy flag, but I preferred new variable, because
  71   the semantics is different. One day, when hard_start_xmit
  72   will be multithreaded we will have to use skb->encapsulation.
  73
  74
  75
  76   2. Networking dead loops would not kill routers, but would really
  77   kill network. IP hop limit plays role of "t->recursion" in this case,
  78   if we copy it from packet being encapsulated to upper header.
  79   It is very good solution, but it introduces two problems:
  80
  81   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  82     do not work over tunnels.
  83   - traceroute does not work. I planned to relay ICMP from tunnel,
  84     so that this problem would be solved and traceroute output
  85     would even more informative. This idea appeared to be wrong:
  86     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  87     true router now :-)), all routers (at least, in neighbourhood of mine)
  88     return only 8 bytes of payload. It is the end.
  89
  90   Hence, if we want that OSPF worked or traceroute said something reasonable,
  91   we should search for another solution.
  92
  93   One of them is to parse packet trying to detect inner encapsulation
  94   made by our node. It is difficult or even impossible, especially,
  95   taking into account fragmentation. To be short, it is not a solution at all.
  96
  97   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  98   We force DF flag on tunnels with preconfigured hop limit,
  99   that is ALL. :-) Well, it does not remove the problem completely,
 100   but exponential growth of network traffic is changed to linear
 101   (branches, that exceed pmtu are pruned) and tunnel mtu
 102   fastly degrades to value <68, where looping stops.
 103   Yes, it is not good if there exists a router in the loop,
 104   which does not force DF, even when encapsulating packets have DF set.
 105   But it is not our problem! Nobody could accuse us, we made
 106   all that we could make. Even if it is your gated who injected
 107   fatal route to network, even if it were you who configured
 108   fatal static route: you are innocent. :-)
 109
 110
 111
 112   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
 113   practically identical code. It would be good to glue them
 114   together, but it is not very evident, how to make them modular.
 115   sit is integral part of IPv6, ipip and gre are naturally modular.
 116   We could extract common parts (hash table, ioctl etc)
 117   to a separate module (ip_tunnel.c).
 118
 119   Alexey Kuznetsov.
 120 */
 121
 122static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 123static int ipgre_tunnel_init(struct net_device *dev);
 124static void ipgre_tunnel_setup(struct net_device *dev);
 125static int ipgre_tunnel_bind_dev(struct net_device *dev);
 126
 127/* Fallback tunnel: no source, no destination, no key, no options */
 128
 129#define HASH_SIZE  16
 130
 131static int ipgre_net_id;
 132struct ipgre_net {
 133        struct ip_tunnel *tunnels[4][HASH_SIZE];
 134
 135        struct net_device *fb_tunnel_dev;
 136};
 137
 138/* Tunnel hash table */
 139
 140/*
 141   4 hash tables:
 142
 143   3: (remote,local)
 144   2: (remote,*)
 145   1: (*,local)
 146   0: (*,*)
 147
 148   We require exact key match i.e. if a key is present in packet
 149   it will match only tunnel with the same key; if it is not present,
 150   it will match only keyless tunnel.
 151
  152   All keyless packets that do not match a configured keyless tunnel
  153   will match the fallback tunnel.
 154 */
 155
 156#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 157
 158#define tunnels_r_l     tunnels[3]
 159#define tunnels_r       tunnels[2]
 160#define tunnels_l       tunnels[1]
 161#define tunnels_wc      tunnels[0]
 162
 163static DEFINE_RWLOCK(ipgre_lock);
 164
 165/* Given src, dst and key, find appropriate for input tunnel. */
 166
 167static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 168                                              __be32 remote, __be32 local,
 169                                              __be32 key, __be16 gre_proto)
 170{
 171        struct net *net = dev_net(dev);
 172        int link = dev->ifindex;
 173        unsigned h0 = HASH(remote);
 174        unsigned h1 = HASH(key);
 175        struct ip_tunnel *t, *cand = NULL;
 176        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 177        int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
 178                       ARPHRD_ETHER : ARPHRD_IPGRE;
 179        int score, cand_score = 4;
 180
 181        for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
 182                if (local != t->parms.iph.saddr ||
 183                    remote != t->parms.iph.daddr ||
 184                    key != t->parms.i_key ||
 185                    !(t->dev->flags & IFF_UP))
 186                        continue;
 187
 188                if (t->dev->type != ARPHRD_IPGRE &&
 189                    t->dev->type != dev_type)
 190                        continue;
 191
 192                score = 0;
 193                if (t->parms.link != link)
 194                        score |= 1;
 195                if (t->dev->type != dev_type)
 196                        score |= 2;
 197                if (score == 0)
 198                        return t;
 199
 200                if (score < cand_score) {
 201                        cand = t;
 202                        cand_score = score;
 203                }
 204        }
 205
 206        for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
 207                if (remote != t->parms.iph.daddr ||
 208                    key != t->parms.i_key ||
 209                    !(t->dev->flags & IFF_UP))
 210                        continue;
 211
 212                if (t->dev->type != ARPHRD_IPGRE &&
 213                    t->dev->type != dev_type)
 214                        continue;
 215
 216                score = 0;
 217                if (t->parms.link != link)
 218                        score |= 1;
 219                if (t->dev->type != dev_type)
 220                        score |= 2;
 221                if (score == 0)
 222                        return t;
 223
 224                if (score < cand_score) {
 225                        cand = t;
 226                        cand_score = score;
 227                }
 228        }
 229
 230        for (t = ign->tunnels_l[h1]; t; t = t->next) {
 231                if ((local != t->parms.iph.saddr &&
 232                     (local != t->parms.iph.daddr ||
 233                      !ipv4_is_multicast(local))) ||
 234                    key != t->parms.i_key ||
 235                    !(t->dev->flags & IFF_UP))
 236                        continue;
 237
 238                if (t->dev->type != ARPHRD_IPGRE &&
 239                    t->dev->type != dev_type)
 240                        continue;
 241
 242                score = 0;
 243                if (t->parms.link != link)
 244                        score |= 1;
 245                if (t->dev->type != dev_type)
 246                        score |= 2;
 247                if (score == 0)
 248                        return t;
 249
 250                if (score < cand_score) {
 251                        cand = t;
 252                        cand_score = score;
 253                }
 254        }
 255
 256        for (t = ign->tunnels_wc[h1]; t; t = t->next) {
 257                if (t->parms.i_key != key ||
 258                    !(t->dev->flags & IFF_UP))
 259                        continue;
 260
 261                if (t->dev->type != ARPHRD_IPGRE &&
 262                    t->dev->type != dev_type)
 263                        continue;
 264
 265                score = 0;
 266                if (t->parms.link != link)
 267                        score |= 1;
 268                if (t->dev->type != dev_type)
 269                        score |= 2;
 270                if (score == 0)
 271                        return t;
 272
 273                if (score < cand_score) {
 274                        cand = t;
 275                        cand_score = score;
 276                }
 277        }
 278
 279        if (cand != NULL)
 280                return cand;
 281
 282        if (ign->fb_tunnel_dev->flags & IFF_UP)
 283                return netdev_priv(ign->fb_tunnel_dev);
 284
 285        return NULL;
 286}
 287
 288static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
 289                struct ip_tunnel_parm *parms)
 290{
 291        __be32 remote = parms->iph.daddr;
 292        __be32 local = parms->iph.saddr;
 293        __be32 key = parms->i_key;
 294        unsigned h = HASH(key);
 295        int prio = 0;
 296
 297        if (local)
 298                prio |= 1;
 299        if (remote && !ipv4_is_multicast(remote)) {
 300                prio |= 2;
 301                h ^= HASH(remote);
 302        }
 303
 304        return &ign->tunnels[prio][h];
 305}
 306
 307static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
 308                struct ip_tunnel *t)
 309{
 310        return __ipgre_bucket(ign, &t->parms);
 311}
 312
 313static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
 314{
 315        struct ip_tunnel **tp = ipgre_bucket(ign, t);
 316
 317        t->next = *tp;
 318        write_lock_bh(&ipgre_lock);
 319        *tp = t;
 320        write_unlock_bh(&ipgre_lock);
 321}
 322
 323static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
 324{
 325        struct ip_tunnel **tp;
 326
 327        for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
 328                if (t == *tp) {
 329                        write_lock_bh(&ipgre_lock);
 330                        *tp = t->next;
 331                        write_unlock_bh(&ipgre_lock);
 332                        break;
 333                }
 334        }
 335}
 336
 337static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 338                                           struct ip_tunnel_parm *parms,
 339                                           int type)
 340{
 341        __be32 remote = parms->iph.daddr;
 342        __be32 local = parms->iph.saddr;
 343        __be32 key = parms->i_key;
 344        int link = parms->link;
 345        struct ip_tunnel *t, **tp;
 346        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 347
 348        for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
 349                if (local == t->parms.iph.saddr &&
 350                    remote == t->parms.iph.daddr &&
 351                    key == t->parms.i_key &&
 352                    link == t->parms.link &&
 353                    type == t->dev->type)
 354                        break;
 355
 356        return t;
 357}
 358
 359static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
 360                struct ip_tunnel_parm *parms, int create)
 361{
 362        struct ip_tunnel *t, *nt;
 363        struct net_device *dev;
 364        char name[IFNAMSIZ];
 365        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 366
 367        t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
 368        if (t || !create)
 369                return t;
 370
 371        if (parms->name[0])
 372                strlcpy(name, parms->name, IFNAMSIZ);
 373        else
 374                sprintf(name, "gre%%d");
 375
 376        dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
 377        if (!dev)
 378          return NULL;
 379
 380        dev_net_set(dev, net);
 381
 382        if (strchr(name, '%')) {
 383                if (dev_alloc_name(dev, name) < 0)
 384                        goto failed_free;
 385        }
 386
 387        nt = netdev_priv(dev);
 388        nt->parms = *parms;
 389        dev->rtnl_link_ops = &ipgre_link_ops;
 390
 391        dev->mtu = ipgre_tunnel_bind_dev(dev);
 392
 393        if (register_netdevice(dev) < 0)
 394                goto failed_free;
 395
 396        dev_hold(dev);
 397        ipgre_tunnel_link(ign, nt);
 398        return nt;
 399
 400failed_free:
 401        free_netdev(dev);
 402        return NULL;
 403}
 404
 405static void ipgre_tunnel_uninit(struct net_device *dev)
 406{
 407        struct net *net = dev_net(dev);
 408        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 409
 410        ipgre_tunnel_unlink(ign, netdev_priv(dev));
 411        dev_put(dev);
 412}
 413
 414
 415static void ipgre_err(struct sk_buff *skb, u32 info)
 416{
 417
 418/* All the routers (except for Linux) return only
 419   8 bytes of packet payload. It means, that precise relaying of
 420   ICMP in the real Internet is absolutely infeasible.
 421
 422   Moreover, Cisco "wise men" put GRE key to the third word
 423   in GRE header. It makes impossible maintaining even soft state for keyed
 424   GRE tunnels with enabled checksum. Tell them "thank you".
 425
 426   Well, I wonder, rfc1812 was written by Cisco employee,
 427   what the hell these idiots break standrads established
 428   by themself???
 429 */
 430
 431        struct iphdr *iph = (struct iphdr *)skb->data;
 432        __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
 433        int grehlen = (iph->ihl<<2) + 4;
 434        const int type = icmp_hdr(skb)->type;
 435        const int code = icmp_hdr(skb)->code;
 436        struct ip_tunnel *t;
 437        __be16 flags;
 438
 439        flags = p[0];
 440        if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
 441                if (flags&(GRE_VERSION|GRE_ROUTING))
 442                        return;
 443                if (flags&GRE_KEY) {
 444                        grehlen += 4;
 445                        if (flags&GRE_CSUM)
 446                                grehlen += 4;
 447                }
 448        }
 449
 450        /* If only 8 bytes returned, keyed message will be dropped here */
 451        if (skb_headlen(skb) < grehlen)
 452                return;
 453
 454        switch (type) {
 455        default:
 456        case ICMP_PARAMETERPROB:
 457                return;
 458
 459        case ICMP_DEST_UNREACH:
 460                switch (code) {
 461                case ICMP_SR_FAILED:
 462                case ICMP_PORT_UNREACH:
 463                        /* Impossible event. */
 464                        return;
 465                case ICMP_FRAG_NEEDED:
 466                        /* Soft state for pmtu is maintained by IP core. */
 467                        return;
 468                default:
 469                        /* All others are translated to HOST_UNREACH.
 470                           rfc2003 contains "deep thoughts" about NET_UNREACH,
 471                           I believe they are just ether pollution. --ANK
 472                         */
 473                        break;
 474                }
 475                break;
 476        case ICMP_TIME_EXCEEDED:
 477                if (code != ICMP_EXC_TTL)
 478                        return;
 479                break;
 480        }
 481
 482        read_lock(&ipgre_lock);
 483        t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
 484                                flags & GRE_KEY ?
 485                                *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
 486                                p[1]);
 487        if (t == NULL || t->parms.iph.daddr == 0 ||
 488            ipv4_is_multicast(t->parms.iph.daddr))
 489                goto out;
 490
 491        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
 492                goto out;
 493
 494        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 495                t->err_count++;
 496        else
 497                t->err_count = 1;
 498        t->err_time = jiffies;
 499out:
 500        read_unlock(&ipgre_lock);
 501        return;
 502}
 503
 504static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
 505{
 506        if (INET_ECN_is_ce(iph->tos)) {
 507                if (skb->protocol == htons(ETH_P_IP)) {
 508                        IP_ECN_set_ce(ip_hdr(skb));
 509                } else if (skb->protocol == htons(ETH_P_IPV6)) {
 510                        IP6_ECN_set_ce(ipv6_hdr(skb));
 511                }
 512        }
 513}
 514
 515static inline u8
 516ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
 517{
 518        u8 inner = 0;
 519        if (skb->protocol == htons(ETH_P_IP))
 520                inner = old_iph->tos;
 521        else if (skb->protocol == htons(ETH_P_IPV6))
 522                inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
 523        return INET_ECN_encapsulate(tos, inner);
 524}
 525
 526static int ipgre_rcv(struct sk_buff *skb)
 527{
 528        struct iphdr *iph;
 529        u8     *h;
 530        __be16    flags;
 531        __sum16   csum = 0;
 532        __be32 key = 0;
 533        u32    seqno = 0;
 534        struct ip_tunnel *tunnel;
 535        int    offset = 4;
 536        __be16 gre_proto;
 537        unsigned int len;
 538
 539        if (!pskb_may_pull(skb, 16))
 540                goto drop_nolock;
 541
 542        iph = ip_hdr(skb);
 543        h = skb->data;
 544        flags = *(__be16*)h;
 545
 546        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
 547                /* - Version must be 0.
 548                   - We do not support routing headers.
 549                 */
 550                if (flags&(GRE_VERSION|GRE_ROUTING))
 551                        goto drop_nolock;
 552
 553                if (flags&GRE_CSUM) {
 554                        switch (skb->ip_summed) {
 555                        case CHECKSUM_COMPLETE:
 556                                csum = csum_fold(skb->csum);
 557                                if (!csum)
 558                                        break;
 559                                /* fall through */
 560                        case CHECKSUM_NONE:
 561                                skb->csum = 0;
 562                                csum = __skb_checksum_complete(skb);
 563                                skb->ip_summed = CHECKSUM_COMPLETE;
 564                        }
 565                        offset += 4;
 566                }
 567                if (flags&GRE_KEY) {
 568                        key = *(__be32*)(h + offset);
 569                        offset += 4;
 570                }
 571                if (flags&GRE_SEQ) {
 572                        seqno = ntohl(*(__be32*)(h + offset));
 573                        offset += 4;
 574                }
 575        }
 576
 577        gre_proto = *(__be16 *)(h + 2);
 578
 579        read_lock(&ipgre_lock);
 580        if ((tunnel = ipgre_tunnel_lookup(skb->dev,
 581                                          iph->saddr, iph->daddr, key,
 582                                          gre_proto))) {
 583                struct net_device_stats *stats = &tunnel->dev->stats;
 584
 585                secpath_reset(skb);
 586
 587                skb->protocol = gre_proto;
 588                /* WCCP version 1 and 2 protocol decoding.
 589                 * - Change protocol to IP
 590                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
 591                 */
 592                if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
 593                        skb->protocol = htons(ETH_P_IP);
 594                        if ((*(h + offset) & 0xF0) != 0x40)
 595                                offset += 4;
 596                }
 597
 598                skb->mac_header = skb->network_header;
 599                __pskb_pull(skb, offset);
 600                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
 601                skb->pkt_type = PACKET_HOST;
 602#ifdef CONFIG_NET_IPGRE_BROADCAST
 603                if (ipv4_is_multicast(iph->daddr)) {
 604                        /* Looped back packet, drop it! */
 605                        if (skb_rtable(skb)->fl.iif == 0)
 606                                goto drop;
 607                        stats->multicast++;
 608                        skb->pkt_type = PACKET_BROADCAST;
 609                }
 610#endif
 611
 612                if (((flags&GRE_CSUM) && csum) ||
 613                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
 614                        stats->rx_crc_errors++;
 615                        stats->rx_errors++;
 616                        goto drop;
 617                }
 618                if (tunnel->parms.i_flags&GRE_SEQ) {
 619                        if (!(flags&GRE_SEQ) ||
 620                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
 621                                stats->rx_fifo_errors++;
 622                                stats->rx_errors++;
 623                                goto drop;
 624                        }
 625                        tunnel->i_seqno = seqno + 1;
 626                }
 627
 628                len = skb->len;
 629
 630                /* Warning: All skb pointers will be invalidated! */
 631                if (tunnel->dev->type == ARPHRD_ETHER) {
 632                        if (!pskb_may_pull(skb, ETH_HLEN)) {
 633                                stats->rx_length_errors++;
 634                                stats->rx_errors++;
 635                                goto drop;
 636                        }
 637
 638                        iph = ip_hdr(skb);
 639                        skb->protocol = eth_type_trans(skb, tunnel->dev);
 640                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 641                }
 642
 643                stats->rx_packets++;
 644                stats->rx_bytes += len;
 645                skb->dev = tunnel->dev;
 646                skb_dst_drop(skb);
 647                nf_reset(skb);
 648
 649                skb_reset_network_header(skb);
 650                ipgre_ecn_decapsulate(iph, skb);
 651
 652                netif_rx(skb);
 653                read_unlock(&ipgre_lock);
 654                return(0);
 655        }
 656        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 657
 658drop:
 659        read_unlock(&ipgre_lock);
 660drop_nolock:
 661        kfree_skb(skb);
 662        return(0);
 663}
 664
 665static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 666{
 667        struct ip_tunnel *tunnel = netdev_priv(dev);
 668        struct net_device_stats *stats = &tunnel->dev->stats;
 669        struct iphdr  *old_iph = ip_hdr(skb);
 670        struct iphdr  *tiph;
 671        u8     tos;
 672        __be16 df;
 673        struct rtable *rt;                      /* Route to the other host */
 674        struct net_device *tdev;                        /* Device to other host */
 675        struct iphdr  *iph;                     /* Our new IP header */
 676        unsigned int max_headroom;              /* The extra header space needed */
 677        int    gre_hlen;
 678        __be32 dst;
 679        int    mtu;
 680
 681        if (tunnel->recursion++) {
 682                stats->collisions++;
 683                goto tx_error;
 684        }
 685
 686        if (dev->type == ARPHRD_ETHER)
 687                IPCB(skb)->flags = 0;
 688
 689        if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
 690                gre_hlen = 0;
 691                tiph = (struct iphdr *)skb->data;
 692        } else {
 693                gre_hlen = tunnel->hlen;
 694                tiph = &tunnel->parms.iph;
 695        }
 696
 697        if ((dst = tiph->daddr) == 0) {
 698                /* NBMA tunnel */
 699
 700                if (skb_dst(skb) == NULL) {
 701                        stats->tx_fifo_errors++;
 702                        goto tx_error;
 703                }
 704
 705                if (skb->protocol == htons(ETH_P_IP)) {
 706                        rt = skb_rtable(skb);
 707                        if ((dst = rt->rt_gateway) == 0)
 708                                goto tx_error_icmp;
 709                }
 710#ifdef CONFIG_IPV6
 711                else if (skb->protocol == htons(ETH_P_IPV6)) {
 712                        struct in6_addr *addr6;
 713                        int addr_type;
 714                        struct neighbour *neigh = skb_dst(skb)->neighbour;
 715
 716                        if (neigh == NULL)
 717                                goto tx_error;
 718
 719                        addr6 = (struct in6_addr *)&neigh->primary_key;
 720                        addr_type = ipv6_addr_type(addr6);
 721
 722                        if (addr_type == IPV6_ADDR_ANY) {
 723                                addr6 = &ipv6_hdr(skb)->daddr;
 724                                addr_type = ipv6_addr_type(addr6);
 725                        }
 726
 727                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 728                                goto tx_error_icmp;
 729
 730                        dst = addr6->s6_addr32[3];
 731                }
 732#endif
 733                else
 734                        goto tx_error;
 735        }
 736
 737        tos = tiph->tos;
 738        if (tos == 1) {
 739                tos = 0;
 740                if (skb->protocol == htons(ETH_P_IP))
 741                        tos = old_iph->tos;
 742        }
 743
 744        {
 745                struct flowi fl = { .oif = tunnel->parms.link,
 746                                    .nl_u = { .ip4_u =
 747                                              { .daddr = dst,
 748                                                .saddr = tiph->saddr,
 749                                                .tos = RT_TOS(tos) } },
 750                                    .proto = IPPROTO_GRE };
 751                if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
 752                        stats->tx_carrier_errors++;
 753                        goto tx_error;
 754                }
 755        }
 756        tdev = rt->u.dst.dev;
 757
 758        if (tdev == dev) {
 759                ip_rt_put(rt);
 760                stats->collisions++;
 761                goto tx_error;
 762        }
 763
 764        df = tiph->frag_off;
 765        if (df)
 766                mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
 767        else
 768                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 769
 770        if (skb_dst(skb))
 771                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 772
 773        if (skb->protocol == htons(ETH_P_IP)) {
 774                df |= (old_iph->frag_off&htons(IP_DF));
 775
 776                if ((old_iph->frag_off&htons(IP_DF)) &&
 777                    mtu < ntohs(old_iph->tot_len)) {
 778                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 779                        ip_rt_put(rt);
 780                        goto tx_error;
 781                }
 782        }
 783#ifdef CONFIG_IPV6
 784        else if (skb->protocol == htons(ETH_P_IPV6)) {
 785                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 786
 787                if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
 788                        if ((tunnel->parms.iph.daddr &&
 789                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 790                            rt6->rt6i_dst.plen == 128) {
 791                                rt6->rt6i_flags |= RTF_MODIFIED;
 792                                skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
 793                        }
 794                }
 795
 796                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
 797                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
 798                        ip_rt_put(rt);
 799                        goto tx_error;
 800                }
 801        }
 802#endif
 803
 804        if (tunnel->err_count > 0) {
 805                if (time_before(jiffies,
 806                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 807                        tunnel->err_count--;
 808
 809                        dst_link_failure(skb);
 810                } else
 811                        tunnel->err_count = 0;
 812        }
 813
 814        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
 815
 816        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
 817            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
 818                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 819                if (!new_skb) {
 820                        ip_rt_put(rt);
 821                        stats->tx_dropped++;
 822                        dev_kfree_skb(skb);
 823                        tunnel->recursion--;
 824                        return 0;
 825                }
 826                if (skb->sk)
 827                        skb_set_owner_w(new_skb, skb->sk);
 828                dev_kfree_skb(skb);
 829                skb = new_skb;
 830                old_iph = ip_hdr(skb);
 831        }
 832
 833        skb_reset_transport_header(skb);
 834        skb_push(skb, gre_hlen);
 835        skb_reset_network_header(skb);
 836        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 837        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 838                              IPSKB_REROUTED);
 839        skb_dst_drop(skb);
 840        skb_dst_set(skb, &rt->u.dst);
 841
 842        /*
 843         *      Push down and install the IPIP header.
 844         */
 845
 846        iph                     =       ip_hdr(skb);
 847        iph->version            =       4;
 848        iph->ihl                =       sizeof(struct iphdr) >> 2;
 849        iph->frag_off           =       df;
 850        iph->protocol           =       IPPROTO_GRE;
 851        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
 852        iph->daddr              =       rt->rt_dst;
 853        iph->saddr              =       rt->rt_src;
 854
 855        if ((iph->ttl = tiph->ttl) == 0) {
 856                if (skb->protocol == htons(ETH_P_IP))
 857                        iph->ttl = old_iph->ttl;
 858#ifdef CONFIG_IPV6
 859                else if (skb->protocol == htons(ETH_P_IPV6))
 860                        iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
 861#endif
 862                else
 863                        iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
 864        }
 865
 866        ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
 867        ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
 868                                   htons(ETH_P_TEB) : skb->protocol;
 869
 870        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
 871                __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
 872
 873                if (tunnel->parms.o_flags&GRE_SEQ) {
 874                        ++tunnel->o_seqno;
 875                        *ptr = htonl(tunnel->o_seqno);
 876                        ptr--;
 877                }
 878                if (tunnel->parms.o_flags&GRE_KEY) {
 879                        *ptr = tunnel->parms.o_key;
 880                        ptr--;
 881                }
 882                if (tunnel->parms.o_flags&GRE_CSUM) {
 883                        *ptr = 0;
 884                        *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
 885                }
 886        }
 887
 888        nf_reset(skb);
 889
 890        IPTUNNEL_XMIT();
 891        tunnel->recursion--;
 892        return 0;
 893
 894tx_error_icmp:
 895        dst_link_failure(skb);
 896
 897tx_error:
 898        stats->tx_errors++;
 899        dev_kfree_skb(skb);
 900        tunnel->recursion--;
 901        return 0;
 902}
 903
 904static int ipgre_tunnel_bind_dev(struct net_device *dev)
 905{
 906        struct net_device *tdev = NULL;
 907        struct ip_tunnel *tunnel;
 908        struct iphdr *iph;
 909        int hlen = LL_MAX_HEADER;
 910        int mtu = ETH_DATA_LEN;
 911        int addend = sizeof(struct iphdr) + 4;
 912
 913        tunnel = netdev_priv(dev);
 914        iph = &tunnel->parms.iph;
 915
 916        /* Guess output device to choose reasonable mtu and needed_headroom */
 917
 918        if (iph->daddr) {
 919                struct flowi fl = { .oif = tunnel->parms.link,
 920                                    .nl_u = { .ip4_u =
 921                                              { .daddr = iph->daddr,
 922                                                .saddr = iph->saddr,
 923                                                .tos = RT_TOS(iph->tos) } },
 924                                    .proto = IPPROTO_GRE };
 925                struct rtable *rt;
 926                if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
 927                        tdev = rt->u.dst.dev;
 928                        ip_rt_put(rt);
 929                }
 930
 931                if (dev->type != ARPHRD_ETHER)
 932                        dev->flags |= IFF_POINTOPOINT;
 933        }
 934
 935        if (!tdev && tunnel->parms.link)
 936                tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
 937
 938        if (tdev) {
 939                hlen = tdev->hard_header_len + tdev->needed_headroom;
 940                mtu = tdev->mtu;
 941        }
 942        dev->iflink = tunnel->parms.link;
 943
 944        /* Precalculate GRE options length */
 945        if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
 946                if (tunnel->parms.o_flags&GRE_CSUM)
 947                        addend += 4;
 948                if (tunnel->parms.o_flags&GRE_KEY)
 949                        addend += 4;
 950                if (tunnel->parms.o_flags&GRE_SEQ)
 951                        addend += 4;
 952        }
 953        dev->needed_headroom = addend + hlen;
 954        mtu -= dev->hard_header_len + addend;
 955
 956        if (mtu < 68)
 957                mtu = 68;
 958
 959        tunnel->hlen = addend;
 960
 961        return mtu;
 962}
 963
 964static int
 965ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 966{
 967        int err = 0;
 968        struct ip_tunnel_parm p;
 969        struct ip_tunnel *t;
 970        struct net *net = dev_net(dev);
 971        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 972
 973        switch (cmd) {
 974        case SIOCGETTUNNEL:
 975                t = NULL;
 976                if (dev == ign->fb_tunnel_dev) {
 977                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
 978                                err = -EFAULT;
 979                                break;
 980                        }
 981                        t = ipgre_tunnel_locate(net, &p, 0);
 982                }
 983                if (t == NULL)
 984                        t = netdev_priv(dev);
 985                memcpy(&p, &t->parms, sizeof(p));
 986                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 987                        err = -EFAULT;
 988                break;
 989
 990        case SIOCADDTUNNEL:
 991        case SIOCCHGTUNNEL:
 992                err = -EPERM;
 993                if (!capable(CAP_NET_ADMIN))
 994                        goto done;
 995
 996                err = -EFAULT;
 997                if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
 998                        goto done;
 999
1000                err = -EINVAL;
1001                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1002                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1003                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1004                        goto done;
1005                if (p.iph.ttl)
1006                        p.iph.frag_off |= htons(IP_DF);
1007
1008                if (!(p.i_flags&GRE_KEY))
1009                        p.i_key = 0;
1010                if (!(p.o_flags&GRE_KEY))
1011                        p.o_key = 0;
1012
1013                t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1014
1015                if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1016                        if (t != NULL) {
1017                                if (t->dev != dev) {
1018                                        err = -EEXIST;
1019                                        break;
1020                                }
1021                        } else {
1022                                unsigned nflags = 0;
1023
1024                                t = netdev_priv(dev);
1025
1026                                if (ipv4_is_multicast(p.iph.daddr))
1027                                        nflags = IFF_BROADCAST;
1028                                else if (p.iph.daddr)
1029                                        nflags = IFF_POINTOPOINT;
1030
1031                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1032                                        err = -EINVAL;
1033                                        break;
1034                                }
1035                                ipgre_tunnel_unlink(ign, t);
1036                                t->parms.iph.saddr = p.iph.saddr;
1037                                t->parms.iph.daddr = p.iph.daddr;
1038                                t->parms.i_key = p.i_key;
1039                                t->parms.o_key = p.o_key;
1040                                memcpy(dev->dev_addr, &p.iph.saddr, 4);
1041                                memcpy(dev->broadcast, &p.iph.daddr, 4);
1042                                ipgre_tunnel_link(ign, t);
1043                                netdev_state_change(dev);
1044                        }
1045                }
1046
1047                if (t) {
1048                        err = 0;
1049                        if (cmd == SIOCCHGTUNNEL) {
1050                                t->parms.iph.ttl = p.iph.ttl;
1051                                t->parms.iph.tos = p.iph.tos;
1052                                t->parms.iph.frag_off = p.iph.frag_off;
1053                                if (t->parms.link != p.link) {
1054                                        t->parms.link = p.link;
1055                                        dev->mtu = ipgre_tunnel_bind_dev(dev);
1056                                        netdev_state_change(dev);
1057                                }
1058                        }
1059                        if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1060                                err = -EFAULT;
1061                } else
1062                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1063                break;
1064
1065        case SIOCDELTUNNEL:
1066                err = -EPERM;
1067                if (!capable(CAP_NET_ADMIN))
1068                        goto done;
1069
1070                if (dev == ign->fb_tunnel_dev) {
1071                        err = -EFAULT;
1072                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1073                                goto done;
1074                        err = -ENOENT;
1075                        if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1076                                goto done;
1077                        err = -EPERM;
1078                        if (t == netdev_priv(ign->fb_tunnel_dev))
1079                                goto done;
1080                        dev = t->dev;
1081                }
1082                unregister_netdevice(dev);
1083                err = 0;
1084                break;
1085
1086        default:
1087                err = -EINVAL;
1088        }
1089
1090done:
1091        return err;
1092}
1093
1094static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1095{
1096        struct ip_tunnel *tunnel = netdev_priv(dev);
1097        if (new_mtu < 68 ||
1098            new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1099                return -EINVAL;
1100        dev->mtu = new_mtu;
1101        return 0;
1102}
1103
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1113
1114   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1115   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1116
1117   ping -t 255 224.66.66.66
1118
1119   If nobody answers, mbone does not work.
1120
1121   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1122   ip addr add 10.66.66.<somewhat>/24 dev Universe
1123   ifconfig Universe up
1124   ifconfig Universe add fe80::<Your_real_addr>/10
1125   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1126   ftp 10.66.66.66
1127   ...
1128   ftp fec0:6666:6666::193.233.7.65
1129   ...
1130
1131 */
1132
1133static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1134                        unsigned short type,
1135                        const void *daddr, const void *saddr, unsigned len)
1136{
1137        struct ip_tunnel *t = netdev_priv(dev);
1138        struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1139        __be16 *p = (__be16*)(iph+1);
1140
1141        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1142        p[0]            = t->parms.o_flags;
1143        p[1]            = htons(type);
1144
1145        /*
1146         *      Set the source hardware address.
1147         */
1148
1149        if (saddr)
1150                memcpy(&iph->saddr, saddr, 4);
1151
1152        if (daddr) {
1153                memcpy(&iph->daddr, daddr, 4);
1154                return t->hlen;
1155        }
1156        if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1157                return t->hlen;
1158
1159        return -t->hlen;
1160}
1161
1162static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1163{
1164        struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1165        memcpy(haddr, &iph->saddr, 4);
1166        return 4;
1167}
1168
1169static const struct header_ops ipgre_header_ops = {
1170        .create = ipgre_header,
1171        .parse  = ipgre_header_parse,
1172};
1173
1174#ifdef CONFIG_NET_IPGRE_BROADCAST
1175static int ipgre_open(struct net_device *dev)
1176{
1177        struct ip_tunnel *t = netdev_priv(dev);
1178
1179        if (ipv4_is_multicast(t->parms.iph.daddr)) {
1180                struct flowi fl = { .oif = t->parms.link,
1181                                    .nl_u = { .ip4_u =
1182                                              { .daddr = t->parms.iph.daddr,
1183                                                .saddr = t->parms.iph.saddr,
1184                                                .tos = RT_TOS(t->parms.iph.tos) } },
1185                                    .proto = IPPROTO_GRE };
1186                struct rtable *rt;
1187                if (ip_route_output_key(dev_net(dev), &rt, &fl))
1188                        return -EADDRNOTAVAIL;
1189                dev = rt->u.dst.dev;
1190                ip_rt_put(rt);
1191                if (__in_dev_get_rtnl(dev) == NULL)
1192                        return -EADDRNOTAVAIL;
1193                t->mlink = dev->ifindex;
1194                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1195        }
1196        return 0;
1197}
1198
1199static int ipgre_close(struct net_device *dev)
1200{
1201        struct ip_tunnel *t = netdev_priv(dev);
1202
1203        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1204                struct in_device *in_dev;
1205                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1206                if (in_dev) {
1207                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1208                        in_dev_put(in_dev);
1209                }
1210        }
1211        return 0;
1212}
1213
1214#endif
1215
1216static const struct net_device_ops ipgre_netdev_ops = {
1217        .ndo_init               = ipgre_tunnel_init,
1218        .ndo_uninit             = ipgre_tunnel_uninit,
1219#ifdef CONFIG_NET_IPGRE_BROADCAST
1220        .ndo_open               = ipgre_open,
1221        .ndo_stop               = ipgre_close,
1222#endif
1223        .ndo_start_xmit         = ipgre_tunnel_xmit,
1224        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1225        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1226};
1227
1228static void ipgre_tunnel_setup(struct net_device *dev)
1229{
1230        dev->netdev_ops         = &ipgre_netdev_ops;
1231        dev->destructor         = free_netdev;
1232
1233        dev->type               = ARPHRD_IPGRE;
1234        dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1235        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1236        dev->flags              = IFF_NOARP;
1237        dev->iflink             = 0;
1238        dev->addr_len           = 4;
1239        dev->features           |= NETIF_F_NETNS_LOCAL;
1240        dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1241}
1242
1243static int ipgre_tunnel_init(struct net_device *dev)
1244{
1245        struct ip_tunnel *tunnel;
1246        struct iphdr *iph;
1247
1248        tunnel = netdev_priv(dev);
1249        iph = &tunnel->parms.iph;
1250
1251        tunnel->dev = dev;
1252        strcpy(tunnel->parms.name, dev->name);
1253
1254        memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1255        memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1256
1257        if (iph->daddr) {
1258#ifdef CONFIG_NET_IPGRE_BROADCAST
1259                if (ipv4_is_multicast(iph->daddr)) {
1260                        if (!iph->saddr)
1261                                return -EINVAL;
1262                        dev->flags = IFF_BROADCAST;
1263                        dev->header_ops = &ipgre_header_ops;
1264                }
1265#endif
1266        } else
1267                dev->header_ops = &ipgre_header_ops;
1268
1269        return 0;
1270}
1271
1272static void ipgre_fb_tunnel_init(struct net_device *dev)
1273{
1274        struct ip_tunnel *tunnel = netdev_priv(dev);
1275        struct iphdr *iph = &tunnel->parms.iph;
1276        struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1277
1278        tunnel->dev = dev;
1279        strcpy(tunnel->parms.name, dev->name);
1280
1281        iph->version            = 4;
1282        iph->protocol           = IPPROTO_GRE;
1283        iph->ihl                = 5;
1284        tunnel->hlen            = sizeof(struct iphdr) + 4;
1285
1286        dev_hold(dev);
1287        ign->tunnels_wc[0]      = tunnel;
1288}
1289
1290
1291static struct net_protocol ipgre_protocol = {
1292        .handler        =       ipgre_rcv,
1293        .err_handler    =       ipgre_err,
1294        .netns_ok       =       1,
1295};
1296
1297static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1298{
1299        int prio;
1300
1301        for (prio = 0; prio < 4; prio++) {
1302                int h;
1303                for (h = 0; h < HASH_SIZE; h++) {
1304                        struct ip_tunnel *t;
1305                        while ((t = ign->tunnels[prio][h]) != NULL)
1306                                unregister_netdevice(t->dev);
1307                }
1308        }
1309}
1310
1311static int ipgre_init_net(struct net *net)
1312{
1313        int err;
1314        struct ipgre_net *ign;
1315
1316        err = -ENOMEM;
1317        ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1318        if (ign == NULL)
1319                goto err_alloc;
1320
1321        err = net_assign_generic(net, ipgre_net_id, ign);
1322        if (err < 0)
1323                goto err_assign;
1324
1325        ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1326                                           ipgre_tunnel_setup);
1327        if (!ign->fb_tunnel_dev) {
1328                err = -ENOMEM;
1329                goto err_alloc_dev;
1330        }
1331        dev_net_set(ign->fb_tunnel_dev, net);
1332
1333        ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1334        ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1335
1336        if ((err = register_netdev(ign->fb_tunnel_dev)))
1337                goto err_reg_dev;
1338
1339        return 0;
1340
1341err_reg_dev:
1342        free_netdev(ign->fb_tunnel_dev);
1343err_alloc_dev:
1344        /* nothing */
1345err_assign:
1346        kfree(ign);
1347err_alloc:
1348        return err;
1349}
1350
1351static void ipgre_exit_net(struct net *net)
1352{
1353        struct ipgre_net *ign;
1354
1355        ign = net_generic(net, ipgre_net_id);
1356        rtnl_lock();
1357        ipgre_destroy_tunnels(ign);
1358        rtnl_unlock();
1359        kfree(ign);
1360}
1361
1362static struct pernet_operations ipgre_net_ops = {
1363        .init = ipgre_init_net,
1364        .exit = ipgre_exit_net,
1365};
1366
1367static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1368{
1369        __be16 flags;
1370
1371        if (!data)
1372                return 0;
1373
1374        flags = 0;
1375        if (data[IFLA_GRE_IFLAGS])
1376                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1377        if (data[IFLA_GRE_OFLAGS])
1378                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1379        if (flags & (GRE_VERSION|GRE_ROUTING))
1380                return -EINVAL;
1381
1382        return 0;
1383}
1384
1385static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1386{
1387        __be32 daddr;
1388
1389        if (tb[IFLA_ADDRESS]) {
1390                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1391                        return -EINVAL;
1392                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1393                        return -EADDRNOTAVAIL;
1394        }
1395
1396        if (!data)
1397                goto out;
1398
1399        if (data[IFLA_GRE_REMOTE]) {
1400                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1401                if (!daddr)
1402                        return -EINVAL;
1403        }
1404
1405out:
1406        return ipgre_tunnel_validate(tb, data);
1407}
1408
1409static void ipgre_netlink_parms(struct nlattr *data[],
1410                                struct ip_tunnel_parm *parms)
1411{
1412        memset(parms, 0, sizeof(*parms));
1413
1414        parms->iph.protocol = IPPROTO_GRE;
1415
1416        if (!data)
1417                return;
1418
1419        if (data[IFLA_GRE_LINK])
1420                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1421
1422        if (data[IFLA_GRE_IFLAGS])
1423                parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1424
1425        if (data[IFLA_GRE_OFLAGS])
1426                parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1427
1428        if (data[IFLA_GRE_IKEY])
1429                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1430
1431        if (data[IFLA_GRE_OKEY])
1432                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1433
1434        if (data[IFLA_GRE_LOCAL])
1435                parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1436
1437        if (data[IFLA_GRE_REMOTE])
1438                parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1439
1440        if (data[IFLA_GRE_TTL])
1441                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1442
1443        if (data[IFLA_GRE_TOS])
1444                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1445
1446        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1447                parms->iph.frag_off = htons(IP_DF);
1448}
1449
1450static int ipgre_tap_init(struct net_device *dev)
1451{
1452        struct ip_tunnel *tunnel;
1453
1454        tunnel = netdev_priv(dev);
1455
1456        tunnel->dev = dev;
1457        strcpy(tunnel->parms.name, dev->name);
1458
1459        ipgre_tunnel_bind_dev(dev);
1460
1461        return 0;
1462}
1463
1464static const struct net_device_ops ipgre_tap_netdev_ops = {
1465        .ndo_init               = ipgre_tap_init,
1466        .ndo_uninit             = ipgre_tunnel_uninit,
1467        .ndo_start_xmit         = ipgre_tunnel_xmit,
1468        .ndo_set_mac_address    = eth_mac_addr,
1469        .ndo_validate_addr      = eth_validate_addr,
1470        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1471};
1472
1473static void ipgre_tap_setup(struct net_device *dev)
1474{
1475
1476        ether_setup(dev);
1477
1478        dev->netdev_ops         = &ipgre_netdev_ops;
1479        dev->destructor         = free_netdev;
1480
1481        dev->iflink             = 0;
1482        dev->features           |= NETIF_F_NETNS_LOCAL;
1483}
1484
1485static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1486                         struct nlattr *data[])
1487{
1488        struct ip_tunnel *nt;
1489        struct net *net = dev_net(dev);
1490        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1491        int mtu;
1492        int err;
1493
1494        nt = netdev_priv(dev);
1495        ipgre_netlink_parms(data, &nt->parms);
1496
1497        if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1498                return -EEXIST;
1499
1500        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1501                random_ether_addr(dev->dev_addr);
1502
1503        mtu = ipgre_tunnel_bind_dev(dev);
1504        if (!tb[IFLA_MTU])
1505                dev->mtu = mtu;
1506
1507        err = register_netdevice(dev);
1508        if (err)
1509                goto out;
1510
1511        dev_hold(dev);
1512        ipgre_tunnel_link(ign, nt);
1513
1514out:
1515        return err;
1516}
1517
1518static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1519                            struct nlattr *data[])
1520{
1521        struct ip_tunnel *t, *nt;
1522        struct net *net = dev_net(dev);
1523        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1524        struct ip_tunnel_parm p;
1525        int mtu;
1526
1527        if (dev == ign->fb_tunnel_dev)
1528                return -EINVAL;
1529
1530        nt = netdev_priv(dev);
1531        ipgre_netlink_parms(data, &p);
1532
1533        t = ipgre_tunnel_locate(net, &p, 0);
1534
1535        if (t) {
1536                if (t->dev != dev)
1537                        return -EEXIST;
1538        } else {
1539                unsigned nflags = 0;
1540
1541                t = nt;
1542
1543                if (ipv4_is_multicast(p.iph.daddr))
1544                        nflags = IFF_BROADCAST;
1545                else if (p.iph.daddr)
1546                        nflags = IFF_POINTOPOINT;
1547
1548                if ((dev->flags ^ nflags) &
1549                    (IFF_POINTOPOINT | IFF_BROADCAST))
1550                        return -EINVAL;
1551
1552                ipgre_tunnel_unlink(ign, t);
1553                t->parms.iph.saddr = p.iph.saddr;
1554                t->parms.iph.daddr = p.iph.daddr;
1555                t->parms.i_key = p.i_key;
1556                memcpy(dev->dev_addr, &p.iph.saddr, 4);
1557                memcpy(dev->broadcast, &p.iph.daddr, 4);
1558                ipgre_tunnel_link(ign, t);
1559                netdev_state_change(dev);
1560        }
1561
1562        t->parms.o_key = p.o_key;
1563        t->parms.iph.ttl = p.iph.ttl;
1564        t->parms.iph.tos = p.iph.tos;
1565        t->parms.iph.frag_off = p.iph.frag_off;
1566
1567        if (t->parms.link != p.link) {
1568                t->parms.link = p.link;
1569                mtu = ipgre_tunnel_bind_dev(dev);
1570                if (!tb[IFLA_MTU])
1571                        dev->mtu = mtu;
1572                netdev_state_change(dev);
1573        }
1574
1575        return 0;
1576}
1577
1578static size_t ipgre_get_size(const struct net_device *dev)
1579{
1580        return
1581                /* IFLA_GRE_LINK */
1582                nla_total_size(4) +
1583                /* IFLA_GRE_IFLAGS */
1584                nla_total_size(2) +
1585                /* IFLA_GRE_OFLAGS */
1586                nla_total_size(2) +
1587                /* IFLA_GRE_IKEY */
1588                nla_total_size(4) +
1589                /* IFLA_GRE_OKEY */
1590                nla_total_size(4) +
1591                /* IFLA_GRE_LOCAL */
1592                nla_total_size(4) +
1593                /* IFLA_GRE_REMOTE */
1594                nla_total_size(4) +
1595                /* IFLA_GRE_TTL */
1596                nla_total_size(1) +
1597                /* IFLA_GRE_TOS */
1598                nla_total_size(1) +
1599                /* IFLA_GRE_PMTUDISC */
1600                nla_total_size(1) +
1601                0;
1602}
1603
1604static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1605{
1606        struct ip_tunnel *t = netdev_priv(dev);
1607        struct ip_tunnel_parm *p = &t->parms;
1608
1609        NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1610        NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1611        NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1612        NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1613        NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1614        NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1615        NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1616        NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1617        NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1618        NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1619
1620        return 0;
1621
1622nla_put_failure:
1623        return -EMSGSIZE;
1624}
1625
1626static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1627        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1628        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1629        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1630        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1631        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1632        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1633        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1634        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1635        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1636        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1637};
1638
1639static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1640        .kind           = "gre",
1641        .maxtype        = IFLA_GRE_MAX,
1642        .policy         = ipgre_policy,
1643        .priv_size      = sizeof(struct ip_tunnel),
1644        .setup          = ipgre_tunnel_setup,
1645        .validate       = ipgre_tunnel_validate,
1646        .newlink        = ipgre_newlink,
1647        .changelink     = ipgre_changelink,
1648        .get_size       = ipgre_get_size,
1649        .fill_info      = ipgre_fill_info,
1650};
1651
1652static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1653        .kind           = "gretap",
1654        .maxtype        = IFLA_GRE_MAX,
1655        .policy         = ipgre_policy,
1656        .priv_size      = sizeof(struct ip_tunnel),
1657        .setup          = ipgre_tap_setup,
1658        .validate       = ipgre_tap_validate,
1659        .newlink        = ipgre_newlink,
1660        .changelink     = ipgre_changelink,
1661        .get_size       = ipgre_get_size,
1662        .fill_info      = ipgre_fill_info,
1663};
1664
1665/*
1666 *      And now the modules code and kernel interface.
1667 */
1668
1669static int __init ipgre_init(void)
1670{
1671        int err;
1672
1673        printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1674
1675        if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1676                printk(KERN_INFO "ipgre init: can't add protocol\n");
1677                return -EAGAIN;
1678        }
1679
1680        err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1681        if (err < 0)
1682                goto gen_device_failed;
1683
1684        err = rtnl_link_register(&ipgre_link_ops);
1685        if (err < 0)
1686                goto rtnl_link_failed;
1687
1688        err = rtnl_link_register(&ipgre_tap_ops);
1689        if (err < 0)
1690                goto tap_ops_failed;
1691
1692out:
1693        return err;
1694
1695tap_ops_failed:
1696        rtnl_link_unregister(&ipgre_link_ops);
1697rtnl_link_failed:
1698        unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1699gen_device_failed:
1700        inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1701        goto out;
1702}
1703
1704static void __exit ipgre_fini(void)
1705{
1706        rtnl_link_unregister(&ipgre_tap_ops);
1707        rtnl_link_unregister(&ipgre_link_ops);
1708        unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1709        if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1710                printk(KERN_INFO "ipgre close: can't remove protocol\n");
1711}
1712
1713module_init(ipgre_init);
1714module_exit(ipgre_fini);
1715MODULE_LICENSE("GPL");
1716MODULE_ALIAS_RTNL_LINK("gre");
1717MODULE_ALIAS_RTNL_LINK("gretap");
1718
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.