/* linux/net/ipv4/ip_gre.c */
   1/*
   2 *      Linux NET3:     GRE over IP protocol decoder.
   3 *
   4 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   5 *
   6 *      This program is free software; you can redistribute it and/or
   7 *      modify it under the terms of the GNU General Public License
   8 *      as published by the Free Software Foundation; either version
   9 *      2 of the License, or (at your option) any later version.
  10 *
  11 */
  12
  13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15#include <linux/capability.h>
  16#include <linux/module.h>
  17#include <linux/types.h>
  18#include <linux/kernel.h>
  19#include <linux/slab.h>
  20#include <asm/uaccess.h>
  21#include <linux/skbuff.h>
  22#include <linux/netdevice.h>
  23#include <linux/in.h>
  24#include <linux/tcp.h>
  25#include <linux/udp.h>
  26#include <linux/if_arp.h>
  27#include <linux/mroute.h>
  28#include <linux/init.h>
  29#include <linux/in6.h>
  30#include <linux/inetdevice.h>
  31#include <linux/igmp.h>
  32#include <linux/netfilter_ipv4.h>
  33#include <linux/etherdevice.h>
  34#include <linux/if_ether.h>
  35
  36#include <net/sock.h>
  37#include <net/ip.h>
  38#include <net/icmp.h>
  39#include <net/protocol.h>
  40#include <net/ipip.h>
  41#include <net/arp.h>
  42#include <net/checksum.h>
  43#include <net/dsfield.h>
  44#include <net/inet_ecn.h>
  45#include <net/xfrm.h>
  46#include <net/net_namespace.h>
  47#include <net/netns/generic.h>
  48#include <net/rtnetlink.h>
  49#include <net/gre.h>
  50
  51#if IS_ENABLED(CONFIG_IPV6)
  52#include <net/ipv6.h>
  53#include <net/ip6_fib.h>
  54#include <net/ip6_route.h>
  55#endif
  56
  57/*
  58   Problems & solutions
  59   --------------------
  60
  61   1. The most important issue is detecting local dead loops.
  62   They would cause complete host lockup in transmit, which
  63   would be "resolved" by stack overflow or, if queueing is enabled,
  64   with infinite looping in net_bh.
  65
  66   We cannot track such dead loops during route installation,
  67   it is infeasible task. The most general solutions would be
  68   to keep skb->encapsulation counter (sort of local ttl),
  69   and silently drop packet when it expires. It is a good
  70   solution, but it supposes maintaining new variable in ALL
  71   skb, even if no tunneling is used.
  72
  73   Current solution: xmit_recursion breaks dead loops. This is a percpu
  74   counter, since when we enter the first ndo_xmit(), cpu migration is
  75   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
  76
  77   2. Networking dead loops would not kill routers, but would really
  78   kill network. IP hop limit plays role of "t->recursion" in this case,
  79   if we copy it from packet being encapsulated to upper header.
  80   It is very good solution, but it introduces two problems:
  81
  82   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  83     do not work over tunnels.
  84   - traceroute does not work. I planned to relay ICMP from tunnel,
  85     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
  87     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  88     true router now :-)), all routers (at least, in neighbourhood of mine)
  89     return only 8 bytes of payload. It is the end.
  90
  91   Hence, if we want that OSPF worked or traceroute said something reasonable,
  92   we should search for another solution.
  93
  94   One of them is to parse packet trying to detect inner encapsulation
  95   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not a solution at all.
  97
  98   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  99   We force DF flag on tunnels with preconfigured hop limit,
 100   that is ALL. :-) Well, it does not remove the problem completely,
 101   but exponential growth of network traffic is changed to linear
 102   (branches, that exceed pmtu are pruned) and tunnel mtu
 103   rapidly degrades to value <68, where looping stops.
 104   Yes, it is not good if there exists a router in the loop,
 105   which does not force DF, even when encapsulating packets have DF set.
 106   But it is not our problem! Nobody could accuse us, we made
 107   all that we could make. Even if it is your gated who injected
 108   fatal route to network, even if it were you who configured
 109   fatal static route: you are innocent. :-)
 110
 111
 112
 113   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
 114   practically identical code. It would be good to glue them
 115   together, but it is not very evident, how to make them modular.
 116   sit is integral part of IPv6, ipip and gre are naturally modular.
 117   We could extract common parts (hash table, ioctl etc)
 118   to a separate module (ip_tunnel.c).
 119
 120   Alexey Kuznetsov.
 121 */
 122
/* Module parameter (0644: writable via sysfs at runtime): log receipt of
 * packets whose outer header lost ECN information -- see the
 * IP_ECN_decapsulate() handling in ipgre_rcv(). */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

/* Forward declarations for helpers defined later in this file. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);
 131
/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

/* Per-network-namespace state: four hash tables of tunnels (indexed by
 * how specific the (remote, local) match is -- see the table comment
 * below) plus the always-present fallback device. */
static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};
 142
/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets, if not matching a configured keyless tunnel,
   will match the fallback tunnel.
 */

/* Fold a 32-bit address or key down to a 4-bit bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walk one RCU-protected hash chain; the caller must declare
 * 'struct ip_tunnel *t', which the macro uses as the cursor. */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 173
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	u64	rx_packets;
	u64	rx_bytes;
	u64	tx_packets;
	u64	tx_bytes;
	struct u64_stats_sync	syncp;	/* makes 64-bit reads consistent on 32-bit hosts */
};
 182
 183static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
 184                                                   struct rtnl_link_stats64 *tot)
 185{
 186        int i;
 187
 188        for_each_possible_cpu(i) {
 189                const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
 190                u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
 191                unsigned int start;
 192
 193                do {
 194                        start = u64_stats_fetch_begin_bh(&tstats->syncp);
 195                        rx_packets = tstats->rx_packets;
 196                        tx_packets = tstats->tx_packets;
 197                        rx_bytes = tstats->rx_bytes;
 198                        tx_bytes = tstats->tx_bytes;
 199                } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
 200
 201                tot->rx_packets += rx_packets;
 202                tot->tx_packets += tx_packets;
 203                tot->rx_bytes   += rx_bytes;
 204                tot->tx_bytes   += tx_bytes;
 205        }
 206
 207        tot->multicast = dev->stats.multicast;
 208        tot->rx_crc_errors = dev->stats.rx_crc_errors;
 209        tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
 210        tot->rx_length_errors = dev->stats.rx_length_errors;
 211        tot->rx_frame_errors = dev->stats.rx_frame_errors;
 212        tot->rx_errors = dev->stats.rx_errors;
 213
 214        tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
 215        tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
 216        tot->tx_dropped = dev->stats.tx_dropped;
 217        tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
 218        tot->tx_errors = dev->stats.tx_errors;
 219
 220        return tot;
 221}
 222
 223/* Does key in tunnel parameters match packet */
 224static bool ipgre_key_match(const struct ip_tunnel_parm *p,
 225                            __be16 flags, __be32 key)
 226{
 227        if (p->i_flags & GRE_KEY) {
 228                if (flags & GRE_KEY)
 229                        return key == p->i_key;
 230                else
 231                        return false;   /* key expected, none present */
 232        } else
 233                return !(flags & GRE_KEY);
 234}
 235
/* Given src, dst and key, find appropriate for input tunnel. */

/* RCU read-side lookup used by the receive and ICMP-error paths.
 * The four hash tables are tried from most to least specific.  Within
 * each chain an exact match on both ingress link and device type wins
 * immediately (score 0); otherwise the best-scoring near-match across
 * all tables is remembered (link mismatch costs 1, device-type mismatch
 * costs 2).  If nothing matched, the per-netns fallback device is
 * returned when it is up, else NULL. */
static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be16 flags, __be32 key,
					     __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB payload means a gretap (Ethernet-over-GRE) device. */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	/* Table 3: (remote,local) fully specified. */
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		/* Plain GRE tunnels accept either payload type; gretap
		 * devices only accept their own. */
		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 2: (remote,*). */
	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 1: (*,local) -- a multicast "local" may also match a
	 * tunnel's configured multicast destination. */
	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 0: (*,*) wildcard -- note this compares i_key directly
	 * rather than via ipgre_key_match(). */
	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Nothing matched: hand the packet to the fallback device. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
 366
 367static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
 368                struct ip_tunnel_parm *parms)
 369{
 370        __be32 remote = parms->iph.daddr;
 371        __be32 local = parms->iph.saddr;
 372        __be32 key = parms->i_key;
 373        unsigned int h = HASH(key);
 374        int prio = 0;
 375
 376        if (local)
 377                prio |= 1;
 378        if (remote && !ipv4_is_multicast(remote)) {
 379                prio |= 2;
 380                h ^= HASH(remote);
 381        }
 382
 383        return &ign->tunnels[prio][h];
 384}
 385
/* Hash-chain head for an existing tunnel, derived from its parameters. */
static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
 391
/* Insert tunnel @t at the head of its hash chain (caller holds RTNL). */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	/* Publish order matters: t->next must be set before *tp makes @t
	 * reachable by RCU readers; rcu_assign_pointer supplies the
	 * required write barrier. */
	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
 399
/* Remove tunnel @t from its hash chain (caller holds RTNL).  Only the
 * predecessor's next pointer is rewritten, so concurrent RCU readers
 * always see a consistent chain. */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
 414
 415static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 416                                           struct ip_tunnel_parm *parms,
 417                                           int type)
 418{
 419        __be32 remote = parms->iph.daddr;
 420        __be32 local = parms->iph.saddr;
 421        __be32 key = parms->i_key;
 422        int link = parms->link;
 423        struct ip_tunnel *t;
 424        struct ip_tunnel __rcu **tp;
 425        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 426
 427        for (tp = __ipgre_bucket(ign, parms);
 428             (t = rtnl_dereference(*tp)) != NULL;
 429             tp = &t->next)
 430                if (local == t->parms.iph.saddr &&
 431                    remote == t->parms.iph.daddr &&
 432                    key == t->parms.i_key &&
 433                    link == t->parms.link &&
 434                    type == t->dev->type)
 435                        break;
 436
 437        return t;
 438}
 439
/* Find a tunnel matching @parms, or -- when @create is set -- allocate,
 * register and hash a new GRE device for it.  Caller holds RTNL.
 * Returns the tunnel, or NULL when no match exists and @create is 0,
 * or when allocation/registration fails. */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");	/* "%d" expanded at registration time */

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	/* Matching dev_put() happens in ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
 484
/* ndo_uninit: unhash the tunnel and drop the reference matching the
 * dev_hold() taken when the tunnel was created and linked. */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
 493
 494
/* ICMP error handler for IPPROTO_GRE.  @skb holds the returned chunk of
 * our own transmitted datagram (outer IP header + start of GRE header).
 * Locates the tunnel it belongs to, then either updates PMTU/redirect
 * routing state or records the error in err_count/err_time. */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));	/* GRE header start */
	int grehlen = (iph->ihl<<2) + 4;	/* outer IP hdr + basic GRE hdr */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;
	__be32 key = 0;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		/* Account for optional fields preceding the key. */
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	if (flags & GRE_KEY)
		key = *(((__be32 *)p) + (grehlen / 4) - 1);

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	/* Addresses are swapped: the quoted packet was sent by us, so its
	 * source is our local endpoint and its destination the remote. */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags, key, p[1]);

	if (t == NULL)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		return;
	}
	/* NBMA and multicast tunnels: no single peer to blame, ignore. */
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	/* ttl==0 means "inherit from payload"; TTL-exceeded is then expected
	 * behaviour (see the DF/looping discussion at the top of the file). */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	/* Record the error: a repeat within IPTUNNEL_ERR_TIMEO bumps the
	 * count, otherwise counting restarts at 1. */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}
 595
 596static inline u8
 597ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
 598{
 599        u8 inner = 0;
 600        if (skb->protocol == htons(ETH_P_IP))
 601                inner = old_iph->tos;
 602        else if (skb->protocol == htons(ETH_P_IPV6))
 603                inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 604        return INET_ECN_encapsulate(tos, inner);
 605}
 606
/* GRE receive handler (registered for IPPROTO_GRE).  Parses the GRE
 * header, finds the matching tunnel and injects the decapsulated packet
 * into the stack; sends ICMP port-unreachable when no tunnel matches.
 * Always consumes the skb and returns 0. */
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* basic GRE header: flags + protocol */
	__be16 gre_proto;
	int    err;

	/* 16 = worst case parsed below: base hdr + csum + key + seq. */
	if (!pskb_may_pull(skb, 16))
		goto drop;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	tunnel = ipgre_tunnel_lookup(skb->dev,
				     iph->saddr, iph->daddr, flags, key,
				     gre_proto);
	if (tunnel) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* Version nibble != 4 -> WCCPv2 redirect header. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* CRC error: either the carried checksum failed, or the
		 * tunnel requires a checksum and none was present. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			/* Drop packets with a missing or backwards sequence
			 * number (signed wrap-safe comparison). */
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		/* err > 1 means the inner frame must be dropped (outer hdr
		 * signalled congestion on a non-ECT inner packet). */
		err = IP_ECN_decapsulate(iph, skb);
		if (unlikely(err)) {
			if (log_ecn_error)
				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
						     &iph->saddr, iph->tos);
			if (err > 1) {
				++tunnel->dev->stats.rx_frame_errors;
				++tunnel->dev->stats.rx_errors;
				goto drop;
			}
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		gro_cells_receive(&tunnel->gro_cells, skb);
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	kfree_skb(skb);
	return 0;
}
 752
 753static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 754{
 755        struct ip_tunnel *tunnel = netdev_priv(dev);
 756        struct pcpu_tstats *tstats;
 757        const struct iphdr  *old_iph = ip_hdr(skb);
 758        const struct iphdr  *tiph;
 759        struct flowi4 fl4;
 760        u8     tos;
 761        __be16 df;
 762        struct rtable *rt;                      /* Route to the other host */
 763        struct net_device *tdev;                /* Device to other host */
 764        struct iphdr  *iph;                     /* Our new IP header */
 765        unsigned int max_headroom;              /* The extra header space needed */
 766        int    gre_hlen;
 767        __be32 dst;
 768        int    mtu;
 769
 770        if (skb->ip_summed == CHECKSUM_PARTIAL &&
 771            skb_checksum_help(skb))
 772                goto tx_error;
 773
 774        if (dev->type == ARPHRD_ETHER)
 775                IPCB(skb)->flags = 0;
 776
 777        if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
 778                gre_hlen = 0;
 779                tiph = (const struct iphdr *)skb->data;
 780        } else {
 781                gre_hlen = tunnel->hlen;
 782                tiph = &tunnel->parms.iph;
 783        }
 784
 785        if ((dst = tiph->daddr) == 0) {
 786                /* NBMA tunnel */
 787
 788                if (skb_dst(skb) == NULL) {
 789                        dev->stats.tx_fifo_errors++;
 790                        goto tx_error;
 791                }
 792
 793                if (skb->protocol == htons(ETH_P_IP)) {
 794                        rt = skb_rtable(skb);
 795                        dst = rt_nexthop(rt, old_iph->daddr);
 796                }
 797#if IS_ENABLED(CONFIG_IPV6)
 798                else if (skb->protocol == htons(ETH_P_IPV6)) {
 799                        const struct in6_addr *addr6;
 800                        struct neighbour *neigh;
 801                        bool do_tx_error_icmp;
 802                        int addr_type;
 803
 804                        neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
 805                        if (neigh == NULL)
 806                                goto tx_error;
 807
 808                        addr6 = (const struct in6_addr *)&neigh->primary_key;
 809                        addr_type = ipv6_addr_type(addr6);
 810
 811                        if (addr_type == IPV6_ADDR_ANY) {
 812                                addr6 = &ipv6_hdr(skb)->daddr;
 813                                addr_type = ipv6_addr_type(addr6);
 814                        }
 815
 816                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 817                                do_tx_error_icmp = true;
 818                        else {
 819                                do_tx_error_icmp = false;
 820                                dst = addr6->s6_addr32[3];
 821                        }
 822                        neigh_release(neigh);
 823                        if (do_tx_error_icmp)
 824                                goto tx_error_icmp;
 825                }
 826#endif
 827                else
 828                        goto tx_error;
 829        }
 830
 831        tos = tiph->tos;
 832        if (tos == 1) {
 833                tos = 0;
 834                if (skb->protocol == htons(ETH_P_IP))
 835                        tos = old_iph->tos;
 836                else if (skb->protocol == htons(ETH_P_IPV6))
 837                        tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 838        }
 839
 840        rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
 841                                 tunnel->parms.o_key, RT_TOS(tos),
 842                                 tunnel->parms.link);
 843        if (IS_ERR(rt)) {
 844                dev->stats.tx_carrier_errors++;
 845                goto tx_error;
 846        }
 847        tdev = rt->dst.dev;
 848
 849        if (tdev == dev) {
 850                ip_rt_put(rt);
 851                dev->stats.collisions++;
 852                goto tx_error;
 853        }
 854
 855        df = tiph->frag_off;
 856        if (df)
 857                mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
 858        else
 859                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 860
 861        if (skb_dst(skb))
 862                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 863
 864        if (skb->protocol == htons(ETH_P_IP)) {
 865                df |= (old_iph->frag_off&htons(IP_DF));
 866
 867                if ((old_iph->frag_off&htons(IP_DF)) &&
 868                    mtu < ntohs(old_iph->tot_len)) {
 869                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 870                        ip_rt_put(rt);
 871                        goto tx_error;
 872                }
 873        }
 874#if IS_ENABLED(CONFIG_IPV6)
 875        else if (skb->protocol == htons(ETH_P_IPV6)) {
 876                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 877
 878                if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
 879                        if ((tunnel->parms.iph.daddr &&
 880                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 881                            rt6->rt6i_dst.plen == 128) {
 882                                rt6->rt6i_flags |= RTF_MODIFIED;
 883                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
 884                        }
 885                }
 886
 887                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
 888                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 889                        ip_rt_put(rt);
 890                        goto tx_error;
 891                }
 892        }
 893#endif
 894
 895        if (tunnel->err_count > 0) {
 896                if (time_before(jiffies,
 897                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 898                        tunnel->err_count--;
 899
 900                        dst_link_failure(skb);
 901                } else
 902                        tunnel->err_count = 0;
 903        }
 904
 905        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
 906
 907        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
 908            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
 909                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 910                if (max_headroom > dev->needed_headroom)
 911                        dev->needed_headroom = max_headroom;
 912                if (!new_skb) {
 913                        ip_rt_put(rt);
 914                        dev->stats.tx_dropped++;
 915                        dev_kfree_skb(skb);
 916                        return NETDEV_TX_OK;
 917                }
 918                if (skb->sk)
 919                        skb_set_owner_w(new_skb, skb->sk);
 920                dev_kfree_skb(skb);
 921                skb = new_skb;
 922                old_iph = ip_hdr(skb);
 923        }
 924
 925        skb_reset_transport_header(skb);
 926        skb_push(skb, gre_hlen);
 927        skb_reset_network_header(skb);
 928        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 929        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 930                              IPSKB_REROUTED);
 931        skb_dst_drop(skb);
 932        skb_dst_set(skb, &rt->dst);
 933
 934        /*
 935         *      Push down and install the IPIP header.
 936         */
 937
 938        iph                     =       ip_hdr(skb);
 939        iph->version            =       4;
 940        iph->ihl                =       sizeof(struct iphdr) >> 2;
 941        iph->frag_off           =       df;
 942        iph->protocol           =       IPPROTO_GRE;
 943        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
 944        iph->daddr              =       fl4.daddr;
 945        iph->saddr              =       fl4.saddr;
 946
 947        if ((iph->ttl = tiph->ttl) == 0) {
 948                if (skb->protocol == htons(ETH_P_IP))
 949                        iph->ttl = old_iph->ttl;
 950#if IS_ENABLED(CONFIG_IPV6)
 951                else if (skb->protocol == htons(ETH_P_IPV6))
 952                        iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
 953#endif
 954                else
 955                        iph->ttl = ip4_dst_hoplimit(&rt->dst);
 956        }
 957
 958        ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
 959        ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
 960                                   htons(ETH_P_TEB) : skb->protocol;
 961
 962        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
 963                __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
 964
 965                if (tunnel->parms.o_flags&GRE_SEQ) {
 966                        ++tunnel->o_seqno;
 967                        *ptr = htonl(tunnel->o_seqno);
 968                        ptr--;
 969                }
 970                if (tunnel->parms.o_flags&GRE_KEY) {
 971                        *ptr = tunnel->parms.o_key;
 972                        ptr--;
 973                }
 974                if (tunnel->parms.o_flags&GRE_CSUM) {
 975                        int offset = skb_transport_offset(skb);
 976
 977                        *ptr = 0;
 978                        *(__sum16 *)ptr = csum_fold(skb_checksum(skb, offset,
 979                                                                 skb->len - offset,
 980                                                                 0));
 981                }
 982        }
 983
 984        nf_reset(skb);
 985        tstats = this_cpu_ptr(dev->tstats);
 986        __IPTUNNEL_XMIT(tstats, &dev->stats);
 987        return NETDEV_TX_OK;
 988
 989#if IS_ENABLED(CONFIG_IPV6)
 990tx_error_icmp:
 991        dst_link_failure(skb);
 992#endif
 993tx_error:
 994        dev->stats.tx_errors++;
 995        dev_kfree_skb(skb);
 996        return NETDEV_TX_OK;
 997}
 998
/* Bind the tunnel to an underlying output device.
 *
 * Guesses the egress device by routing toward the tunnel's remote
 * endpoint (falling back to parms.link when there is no remote or no
 * route), then derives dev->needed_headroom, the precomputed GRE
 * encapsulation length (tunnel->hlen) and a suggested MTU from it.
 *
 * Returns the MTU the tunnel device should use.
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel;
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;       /* pessimistic default headroom */
        int mtu = ETH_DATA_LEN;
        int addend = sizeof(struct iphdr) + 4;  /* outer IP + base GRE header */

        tunnel = netdev_priv(dev);
        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */

        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                rt = ip_route_output_gre(dev_net(dev), &fl4,
                                         iph->daddr, iph->saddr,
                                         tunnel->parms.o_key,
                                         RT_TOS(iph->tos),
                                         tunnel->parms.link);
                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }

                /* A configured remote on a non-Ethernet GRE device makes
                 * it point-to-point. */
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;
        }

        /* No route found: fall back to the explicitly bound link, if any. */
        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = tdev->mtu;
        }
        dev->iflink = tunnel->parms.link;

        /* Precalculate GRE options length */
        if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
                if (tunnel->parms.o_flags&GRE_CSUM)
                        addend += 4;
                if (tunnel->parms.o_flags&GRE_KEY)
                        addend += 4;
                if (tunnel->parms.o_flags&GRE_SEQ)
                        addend += 4;
        }
        dev->needed_headroom = addend + hlen;
        mtu -= dev->hard_header_len + addend;

        /* Never report less than the historical IPv4 minimum MTU. */
        if (mtu < 68)
                mtu = 68;

        tunnel->hlen = addend;

        return mtu;
}
1059
/*
 * Legacy ioctl configuration interface for GRE tunnels:
 * SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL / SIOCDELTUNNEL.
 * Tunnel parameters are exchanged with user space through
 * ifr->ifr_ifru.ifru_data as a struct ip_tunnel_parm.
 *
 * Returns 0 on success or a negative errno.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
        int err = 0;
        struct ip_tunnel_parm p;
        struct ip_tunnel *t;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                t = NULL;
                if (dev == ign->fb_tunnel_dev) {
                        /* On the fallback device the caller selects which
                         * tunnel to query via the parms it passed in. */
                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
                                err = -EFAULT;
                                break;
                        }
                        t = ipgre_tunnel_locate(net, &p, 0);
                }
                if (t == NULL)
                        t = netdev_priv(dev);
                memcpy(&p, &t->parms, sizeof(p));
                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                        err = -EFAULT;
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!capable(CAP_NET_ADMIN))
                        goto done;

                err = -EFAULT;
                if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                        goto done;

                /* Accept only plain GRE-in-IPv4 without IP options, and
                 * reject GRE bits (version/routing) we do not implement. */
                err = -EINVAL;
                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
                        goto done;
                if (p.iph.ttl)
                        p.iph.frag_off |= htons(IP_DF);

                /* Keys only matter when the corresponding flag is set. */
                if (!(p.i_flags&GRE_KEY))
                        p.i_key = 0;
                if (!(p.o_flags&GRE_KEY))
                        p.o_key = 0;

                t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

                if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t != NULL) {
                                /* Requested parms already belong to another
                                 * device. */
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                t = netdev_priv(dev);

                                if (ipv4_is_multicast(p.iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p.iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                /* Switching between broadcast and
                                 * point-to-point mode is not supported. */
                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }
                                /* Re-hash the tunnel under the new
                                 * endpoints/keys; wait for RCU readers
                                 * before mutating the parms in place. */
                                ipgre_tunnel_unlink(ign, t);
                                synchronize_net();
                                t->parms.iph.saddr = p.iph.saddr;
                                t->parms.iph.daddr = p.iph.daddr;
                                t->parms.i_key = p.i_key;
                                t->parms.o_key = p.o_key;
                                memcpy(dev->dev_addr, &p.iph.saddr, 4);
                                memcpy(dev->broadcast, &p.iph.daddr, 4);
                                ipgre_tunnel_link(ign, t);
                                netdev_state_change(dev);
                        }
                }

                if (t) {
                        err = 0;
                        if (cmd == SIOCCHGTUNNEL) {
                                t->parms.iph.ttl = p.iph.ttl;
                                t->parms.iph.tos = p.iph.tos;
                                t->parms.iph.frag_off = p.iph.frag_off;
                                if (t->parms.link != p.link) {
                                        t->parms.link = p.link;
                                        dev->mtu = ipgre_tunnel_bind_dev(dev);
                                        netdev_state_change(dev);
                                }
                        }
                        if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
                                err = -EFAULT;
                } else
                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!capable(CAP_NET_ADMIN))
                        goto done;

                if (dev == ign->fb_tunnel_dev) {
                        err = -EFAULT;
                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                                goto done;
                        err = -ENOENT;
                        if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
                                goto done;
                        err = -EPERM;
                        /* The fallback tunnel itself may not be deleted. */
                        if (t == netdev_priv(ign->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
1190
1191static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1192{
1193        struct ip_tunnel *tunnel = netdev_priv(dev);
1194        if (new_mtu < 68 ||
1195            new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1196                return -EINVAL;
1197        dev->mtu = new_mtu;
1198        return 0;
1199}
1200
1201/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1203   over the Internet, provided multicast routing is tuned.
1204
1205
   I have no idea whether this bicycle was invented before me,
1207   so that I had to set ARPHRD_IPGRE to a random value.
1208   I have an impression, that Cisco could make something similar,
1209   but this feature is apparently missing in IOS<=11.2(8).
1210
1211   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1212   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1213
1214   ping -t 255 224.66.66.66
1215
1216   If nobody answers, mbone does not work.
1217
1218   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1219   ip addr add 10.66.66.<somewhat>/24 dev Universe
1220   ifconfig Universe up
1221   ifconfig Universe add fe80::<Your_real_addr>/10
1222   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1223   ftp 10.66.66.66
1224   ...
1225   ftp fec0:6666:6666::193.233.7.65
1226   ...
1227
1228 */
1229
1230static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1231                        unsigned short type,
1232                        const void *daddr, const void *saddr, unsigned int len)
1233{
1234        struct ip_tunnel *t = netdev_priv(dev);
1235        struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1236        __be16 *p = (__be16 *)(iph+1);
1237
1238        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1239        p[0]            = t->parms.o_flags;
1240        p[1]            = htons(type);
1241
1242        /*
1243         *      Set the source hardware address.
1244         */
1245
1246        if (saddr)
1247                memcpy(&iph->saddr, saddr, 4);
1248        if (daddr)
1249                memcpy(&iph->daddr, daddr, 4);
1250        if (iph->daddr)
1251                return t->hlen;
1252
1253        return -t->hlen;
1254}
1255
/* hard_header parse hook: report the outer IPv4 source address as the
 * sender's 4-byte "hardware" address.  Returns the address length. */
static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
        memcpy(haddr, &iph->saddr, 4);
        return 4;
}
1262
/* Link-layer header ops for GRE devices that use the outer IPv4
 * address as their 4-byte hardware address (broadcast tunnels and
 * tunnels without a configured remote). */
static const struct header_ops ipgre_header_ops = {
        .create = ipgre_header,
        .parse  = ipgre_header_parse,
};
1267
1268#ifdef CONFIG_NET_IPGRE_BROADCAST
/* ndo_open: for a multicast (broadcast-mode) tunnel, route to the
 * multicast destination to find the real egress device and join the
 * group on it.  The joined ifindex is remembered in t->mlink so
 * ipgre_close() can leave the group again.
 *
 * Returns 0 on success or -EADDRNOTAVAIL if no usable route/device.
 */
static int ipgre_open(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr)) {
                struct flowi4 fl4;
                struct rtable *rt;

                rt = ip_route_output_gre(dev_net(dev), &fl4,
                                         t->parms.iph.daddr,
                                         t->parms.iph.saddr,
                                         t->parms.o_key,
                                         RT_TOS(t->parms.iph.tos),
                                         t->parms.link);
                if (IS_ERR(rt))
                        return -EADDRNOTAVAIL;
                /* Operate on the routed egress device from here on. */
                dev = rt->dst.dev;
                ip_rt_put(rt);
                if (__in_dev_get_rtnl(dev) == NULL)
                        return -EADDRNOTAVAIL;
                t->mlink = dev->ifindex;
                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
        }
        return 0;
}
1294
1295static int ipgre_close(struct net_device *dev)
1296{
1297        struct ip_tunnel *t = netdev_priv(dev);
1298
1299        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1300                struct in_device *in_dev;
1301                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1302                if (in_dev)
1303                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1304        }
1305        return 0;
1306}
1307
1308#endif
1309
/* Device operations for plain (ARPHRD_IPGRE) tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
        .ndo_init               = ipgre_tunnel_init,
        .ndo_uninit             = ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
        .ndo_open               = ipgre_open,
        .ndo_stop               = ipgre_close,
#endif
        .ndo_start_xmit         = ipgre_tunnel_xmit,
        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
        .ndo_get_stats64        = ipgre_get_stats64,
};
1322
/* Device destructor: tear down GRO cells and per-cpu stats before
 * freeing the netdev itself (order matters — tunnel lives inside dev). */
static void ipgre_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        free_percpu(dev->tstats);
        free_netdev(dev);
}
1331
/* Offload features a GRE tunnel device advertises. */
#define GRE_FEATURES (NETIF_F_SG |              \
                      NETIF_F_FRAGLIST |        \
                      NETIF_F_HIGHDMA |         \
                      NETIF_F_HW_CSUM)
1336
/* alloc_netdev() setup callback for plain GRE devices: install ops and
 * sensible defaults before ndo_init runs. */
static void ipgre_tunnel_setup(struct net_device *dev)
{
        dev->netdev_ops         = &ipgre_netdev_ops;
        dev->destructor         = ipgre_dev_free;

        dev->type               = ARPHRD_IPGRE;
        /* Default headroom/MTU assume outer IP plus 4-byte base GRE header;
         * ipgre_tunnel_bind_dev() refines these once parms are known. */
        dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
        dev->flags              = IFF_NOARP;
        dev->iflink             = 0;
        dev->addr_len           = 4;
        dev->features           |= NETIF_F_NETNS_LOCAL;
        dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;

        dev->features           |= GRE_FEATURES;
        dev->hw_features        |= GRE_FEATURES;
}
1354
/* ndo_init for plain GRE devices: copy the configured endpoints into
 * the device addresses, pick header_ops for broadcast/NBMA modes, and
 * allocate per-cpu stats plus GRO cells.
 *
 * Returns 0 on success or a negative errno.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel;
        struct iphdr *iph;
        int err;

        tunnel = netdev_priv(dev);
        iph = &tunnel->parms.iph;

        tunnel->dev = dev;
        strcpy(tunnel->parms.name, dev->name);

        /* The 4-byte "hardware" addresses mirror the outer IPv4 endpoints. */
        memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
        memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

        if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        /* Broadcast mode needs a fixed local address. */
                        if (!iph->saddr)
                                return -EINVAL;
                        dev->flags = IFF_BROADCAST;
                        dev->header_ops = &ipgre_header_ops;
                }
#endif
        } else
                /* No remote configured: NBMA-style, per-packet destination. */
                dev->header_ops = &ipgre_header_ops;

        dev->tstats = alloc_percpu(struct pcpu_tstats);
        if (!dev->tstats)
                return -ENOMEM;

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                /* Unwind the stats allocation on failure. */
                free_percpu(dev->tstats);
                return err;
        }

        return 0;
}
1394
/* Initialize the per-namespace fallback ("gre0") tunnel device with a
 * minimal outer IPv4 template and the base GRE header length.  The
 * extra dev_hold() keeps the fallback device pinned for the lifetime
 * of the namespace. */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;

        tunnel->dev = dev;
        strcpy(tunnel->parms.name, dev->name);

        iph->version            = 4;
        iph->protocol           = IPPROTO_GRE;
        iph->ihl                = 5;
        tunnel->hlen            = sizeof(struct iphdr) + 4;

        dev_hold(dev);
}
1410
1411
/* Receive/error hooks registered with the shared GRE demux layer. */
static const struct gre_protocol ipgre_protocol = {
        .handler     = ipgre_rcv,
        .err_handler = ipgre_err,
};
1416
1417static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1418{
1419        int prio;
1420
1421        for (prio = 0; prio < 4; prio++) {
1422                int h;
1423                for (h = 0; h < HASH_SIZE; h++) {
1424                        struct ip_tunnel *t;
1425
1426                        t = rtnl_dereference(ign->tunnels[prio][h]);
1427
1428                        while (t != NULL) {
1429                                unregister_netdevice_queue(t->dev, head);
1430                                t = rtnl_dereference(t->next);
1431                        }
1432                }
1433        }
1434}
1435
1436static int __net_init ipgre_init_net(struct net *net)
1437{
1438        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1439        int err;
1440
1441        ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1442                                           ipgre_tunnel_setup);
1443        if (!ign->fb_tunnel_dev) {
1444                err = -ENOMEM;
1445                goto err_alloc_dev;
1446        }
1447        dev_net_set(ign->fb_tunnel_dev, net);
1448
1449        ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1450        ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1451
1452        if ((err = register_netdev(ign->fb_tunnel_dev)))
1453                goto err_reg_dev;
1454
1455        rcu_assign_pointer(ign->tunnels_wc[0],
1456                           netdev_priv(ign->fb_tunnel_dev));
1457        return 0;
1458
1459err_reg_dev:
1460        ipgre_dev_free(ign->fb_tunnel_dev);
1461err_alloc_dev:
1462        return err;
1463}
1464
1465static void __net_exit ipgre_exit_net(struct net *net)
1466{
1467        struct ipgre_net *ign;
1468        LIST_HEAD(list);
1469
1470        ign = net_generic(net, ipgre_net_id);
1471        rtnl_lock();
1472        ipgre_destroy_tunnels(ign, &list);
1473        unregister_netdevice_many(&list);
1474        rtnl_unlock();
1475}
1476
/* Network-namespace lifecycle hooks and per-namespace state size. */
static struct pernet_operations ipgre_net_ops = {
        .init = ipgre_init_net,
        .exit = ipgre_exit_net,
        .id   = &ipgre_net_id,
        .size = sizeof(struct ipgre_net),
};
1483
1484static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1485{
1486        __be16 flags;
1487
1488        if (!data)
1489                return 0;
1490
1491        flags = 0;
1492        if (data[IFLA_GRE_IFLAGS])
1493                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1494        if (data[IFLA_GRE_OFLAGS])
1495                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1496        if (flags & (GRE_VERSION|GRE_ROUTING))
1497                return -EINVAL;
1498
1499        return 0;
1500}
1501
1502static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1503{
1504        __be32 daddr;
1505
1506        if (tb[IFLA_ADDRESS]) {
1507                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1508                        return -EINVAL;
1509                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1510                        return -EADDRNOTAVAIL;
1511        }
1512
1513        if (!data)
1514                goto out;
1515
1516        if (data[IFLA_GRE_REMOTE]) {
1517                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1518                if (!daddr)
1519                        return -EINVAL;
1520        }
1521
1522out:
1523        return ipgre_tunnel_validate(tb, data);
1524}
1525
/* Translate IFLA_GRE_* netlink attributes into @parms.
 *
 * Missing attributes leave the zeroed defaults.  PMTU discovery
 * defaults to on (DF set in the outer header) unless the caller
 * explicitly disables it via IFLA_GRE_PMTUDISC.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
                                struct ip_tunnel_parm *parms)
{
        memset(parms, 0, sizeof(*parms));

        parms->iph.protocol = IPPROTO_GRE;

        if (!data)
                return;

        if (data[IFLA_GRE_LINK])
                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

        if (data[IFLA_GRE_IFLAGS])
                parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

        if (data[IFLA_GRE_OFLAGS])
                parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

        if (data[IFLA_GRE_IKEY])
                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

        if (data[IFLA_GRE_OKEY])
                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

        if (data[IFLA_GRE_LOCAL])
                parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

        if (data[IFLA_GRE_REMOTE])
                parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

        if (data[IFLA_GRE_TTL])
                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

        if (data[IFLA_GRE_TOS])
                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
                parms->iph.frag_off = htons(IP_DF);
}
1566
/* ndo_init for gretap (Ethernet-over-GRE) devices: bind to the
 * underlying device and allocate per-cpu stats.
 *
 * NOTE(review): unlike ipgre_tunnel_init() this does not set up
 * gro_cells — confirm that is intentional for the tap variant.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int ipgre_tap_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel;

        tunnel = netdev_priv(dev);

        tunnel->dev = dev;
        strcpy(tunnel->parms.name, dev->name);

        ipgre_tunnel_bind_dev(dev);

        dev->tstats = alloc_percpu(struct pcpu_tstats);
        if (!dev->tstats)
                return -ENOMEM;

        return 0;
}
1584
/* Device operations for gretap (Ethernet-over-GRE) devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
        .ndo_init               = ipgre_tap_init,
        .ndo_uninit             = ipgre_tunnel_uninit,
        .ndo_start_xmit         = ipgre_tunnel_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
        .ndo_get_stats64        = ipgre_get_stats64,
};
1594
/* alloc_netdev() setup callback for gretap devices: start from the
 * generic Ethernet defaults, then override the ops and destructor. */
static void ipgre_tap_setup(struct net_device *dev)
{

        ether_setup(dev);

        dev->netdev_ops         = &ipgre_tap_netdev_ops;
        dev->destructor         = ipgre_dev_free;

        dev->iflink             = 0;
        dev->features           |= NETIF_F_NETNS_LOCAL;
}
1606
/* rtnl_link_ops newlink hook: create a GRE/gretap device from netlink
 * attributes, register it and link it into the tunnel hash.
 *
 * Returns 0 on success, -EEXIST if a tunnel with identical parms
 * already exists, or the register_netdevice() error.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
                         struct nlattr *data[])
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
        int mtu;
        int err;

        nt = netdev_priv(dev);
        ipgre_netlink_parms(data, &nt->parms);

        if (ipgre_tunnel_find(net, &nt->parms, dev->type))
                return -EEXIST;

        /* gretap devices get a random MAC unless one was supplied. */
        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        /* An explicit IFLA_MTU wins over the derived one. */
        mtu = ipgre_tunnel_bind_dev(dev);
        if (!tb[IFLA_MTU])
                dev->mtu = mtu;

        /* Can use a lockless transmit, unless we generate output sequences */
        if (!(nt->parms.o_flags & GRE_SEQ))
                dev->features |= NETIF_F_LLTX;

        err = register_netdevice(dev);
        if (err)
                goto out;

        dev_hold(dev);
        ipgre_tunnel_link(ign, nt);

out:
        return err;
}
1643
/* rtnl_link_ops changelink hook: update an existing GRE/gretap device
 * from netlink attributes, re-hashing it if its endpoints/keys moved.
 *
 * NOTE(review): unlike the SIOCCHGTUNNEL path, this does not call
 * synchronize_net() between ipgre_tunnel_unlink() and mutating the
 * parms — confirm concurrent RCU readers cannot observe torn state.
 *
 * Returns 0 on success, -EINVAL for the fallback device or a
 * disallowed mode change, -EEXIST if the parms clash with another
 * device.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
                            struct nlattr *data[])
{
        struct ip_tunnel *t, *nt;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
        struct ip_tunnel_parm p;
        int mtu;

        if (dev == ign->fb_tunnel_dev)
                return -EINVAL;

        nt = netdev_priv(dev);
        ipgre_netlink_parms(data, &p);

        t = ipgre_tunnel_locate(net, &p, 0);

        if (t) {
                /* The requested parms already belong to another device. */
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = nt;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p.iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p.iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        /* Switching between broadcast and point-to-point
                         * mode is not supported. */
                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }

                /* Re-hash under the new endpoints/input key. */
                ipgre_tunnel_unlink(ign, t);
                t->parms.iph.saddr = p.iph.saddr;
                t->parms.iph.daddr = p.iph.daddr;
                t->parms.i_key = p.i_key;
                if (dev->type != ARPHRD_ETHER) {
                        memcpy(dev->dev_addr, &p.iph.saddr, 4);
                        memcpy(dev->broadcast, &p.iph.daddr, 4);
                }
                ipgre_tunnel_link(ign, t);
                netdev_state_change(dev);
        }

        /* Non-hashed parameters can always be updated in place. */
        t->parms.o_key = p.o_key;
        t->parms.iph.ttl = p.iph.ttl;
        t->parms.iph.tos = p.iph.tos;
        t->parms.iph.frag_off = p.iph.frag_off;

        if (t->parms.link != p.link) {
                t->parms.link = p.link;
                mtu = ipgre_tunnel_bind_dev(dev);
                /* An explicit IFLA_MTU wins over the derived one. */
                if (!tb[IFLA_MTU])
                        dev->mtu = mtu;
                netdev_state_change(dev);
        }

        return 0;
}
1707
1708static size_t ipgre_get_size(const struct net_device *dev)
1709{
1710        return
1711                /* IFLA_GRE_LINK */
1712                nla_total_size(4) +
1713                /* IFLA_GRE_IFLAGS */
1714                nla_total_size(2) +
1715                /* IFLA_GRE_OFLAGS */
1716                nla_total_size(2) +
1717                /* IFLA_GRE_IKEY */
1718                nla_total_size(4) +
1719                /* IFLA_GRE_OKEY */
1720                nla_total_size(4) +
1721                /* IFLA_GRE_LOCAL */
1722                nla_total_size(4) +
1723                /* IFLA_GRE_REMOTE */
1724                nla_total_size(4) +
1725                /* IFLA_GRE_TTL */
1726                nla_total_size(1) +
1727                /* IFLA_GRE_TOS */
1728                nla_total_size(1) +
1729                /* IFLA_GRE_PMTUDISC */
1730                nla_total_size(1) +
1731                0;
1732}
1733
1734static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1735{
1736        struct ip_tunnel *t = netdev_priv(dev);
1737        struct ip_tunnel_parm *p = &t->parms;
1738
1739        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1740            nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1741            nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1742            nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1743            nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1744            nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1745            nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1746            nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1747            nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1748            nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1749                       !!(p->iph.frag_off & htons(IP_DF))))
1750                goto nla_put_failure;
1751        return 0;
1752
1753nla_put_failure:
1754        return -EMSGSIZE;
1755}
1756
/* Validation policy for the IFLA_GRE_* attributes accepted by
 * newlink/changelink; addresses are checked by length, not type.
 */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1769
/* rtnl_link ops for plain (layer-3) GRE devices: "ip link add type gre". */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1782
/* rtnl_link ops for Ethernet-over-GRE devices: "ip link add type gretap".
 * Shares policy/newlink/changelink with "gre"; only setup/validate differ.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1795
1796/*
1797 *      And now the modules code and kernel interface.
1798 */
1799
1800static int __init ipgre_init(void)
1801{
1802        int err;
1803
1804        pr_info("GRE over IPv4 tunneling driver\n");
1805
1806        err = register_pernet_device(&ipgre_net_ops);
1807        if (err < 0)
1808                return err;
1809
1810        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1811        if (err < 0) {
1812                pr_info("%s: can't add protocol\n", __func__);
1813                goto add_proto_failed;
1814        }
1815
1816        err = rtnl_link_register(&ipgre_link_ops);
1817        if (err < 0)
1818                goto rtnl_link_failed;
1819
1820        err = rtnl_link_register(&ipgre_tap_ops);
1821        if (err < 0)
1822                goto tap_ops_failed;
1823
1824out:
1825        return err;
1826
1827tap_ops_failed:
1828        rtnl_link_unregister(&ipgre_link_ops);
1829rtnl_link_failed:
1830        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1831add_proto_failed:
1832        unregister_pernet_device(&ipgre_net_ops);
1833        goto out;
1834}
1835
/* Module exit: tear down in exact reverse order of ipgre_init(). */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	/* Removal can only fail if the protocol slot no longer holds us;
	 * nothing to do about it at this point beyond logging.
	 */
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}
1844
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Aliases so the module is auto-loaded for these link kinds / device name. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
1851
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.