/* linux/net/ipv4/ip_gre.c */
   1/*
   2 *      Linux NET3:     GRE over IP protocol decoder.
   3 *
   4 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   5 *
   6 *      This program is free software; you can redistribute it and/or
   7 *      modify it under the terms of the GNU General Public License
   8 *      as published by the Free Software Foundation; either version
   9 *      2 of the License, or (at your option) any later version.
  10 *
  11 */
  12
  13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15#include <linux/capability.h>
  16#include <linux/module.h>
  17#include <linux/types.h>
  18#include <linux/kernel.h>
  19#include <linux/slab.h>
  20#include <asm/uaccess.h>
  21#include <linux/skbuff.h>
  22#include <linux/netdevice.h>
  23#include <linux/in.h>
  24#include <linux/tcp.h>
  25#include <linux/udp.h>
  26#include <linux/if_arp.h>
  27#include <linux/mroute.h>
  28#include <linux/init.h>
  29#include <linux/in6.h>
  30#include <linux/inetdevice.h>
  31#include <linux/igmp.h>
  32#include <linux/netfilter_ipv4.h>
  33#include <linux/etherdevice.h>
  34#include <linux/if_ether.h>
  35
  36#include <net/sock.h>
  37#include <net/ip.h>
  38#include <net/icmp.h>
  39#include <net/protocol.h>
  40#include <net/ipip.h>
  41#include <net/arp.h>
  42#include <net/checksum.h>
  43#include <net/dsfield.h>
  44#include <net/inet_ecn.h>
  45#include <net/xfrm.h>
  46#include <net/net_namespace.h>
  47#include <net/netns/generic.h>
  48#include <net/rtnetlink.h>
  49#include <net/gre.h>
  50
  51#if IS_ENABLED(CONFIG_IPV6)
  52#include <net/ipv6.h>
  53#include <net/ip6_fib.h>
  54#include <net/ip6_route.h>
  55#endif
  56
  57/*
  58   Problems & solutions
  59   --------------------
  60
  61   1. The most important issue is detecting local dead loops.
  62   They would cause complete host lockup in transmit, which
  63   would be "resolved" by stack overflow or, if queueing is enabled,
  64   with infinite looping in net_bh.
  65
  66   We cannot track such dead loops during route installation,
  67   it is infeasible task. The most general solutions would be
  68   to keep skb->encapsulation counter (sort of local ttl),
  69   and silently drop packet when it expires. It is a good
  70   solution, but it supposes maintaining new variable in ALL
  71   skb, even if no tunneling is used.
  72
  73   Current solution: xmit_recursion breaks dead loops. This is a percpu
  74   counter, since when we enter the first ndo_xmit(), cpu migration is
  75   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
  76
  77   2. Networking dead loops would not kill routers, but would really
  78   kill network. IP hop limit plays role of "t->recursion" in this case,
  79   if we copy it from packet being encapsulated to upper header.
  80   It is very good solution, but it introduces two problems:
  81
  82   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  83     do not work over tunnels.
  84   - traceroute does not work. I planned to relay ICMP from tunnel,
  85     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
  87     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  88     true router now :-)), all routers (at least, in neighbourhood of mine)
  89     return only 8 bytes of payload. It is the end.
  90
   Hence, if we want OSPF to work or traceroute to say something reasonable,
  92   we should search for another solution.
  93
  94   One of them is to parse packet trying to detect inner encapsulation
  95   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not a solution at all.
  97
  98   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  99   We force DF flag on tunnels with preconfigured hop limit,
 100   that is ALL. :-) Well, it does not remove the problem completely,
 101   but exponential growth of network traffic is changed to linear
 102   (branches, that exceed pmtu are pruned) and tunnel mtu
 103   rapidly degrades to value <68, where looping stops.
 104   Yes, it is not good if there exists a router in the loop,
 105   which does not force DF, even when encapsulating packets have DF set.
 106   But it is not our problem! Nobody could accuse us, we made
 107   all that we could make. Even if it is your gated who injected
 108   fatal route to network, even if it were you who configured
 109   fatal static route: you are innocent. :-)
 110
 111
 112
 113   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
 114   practically identical code. It would be good to glue them
 115   together, but it is not very evident, how to make them modular.
 116   sit is integral part of IPv6, ipip and gre are naturally modular.
 117   We could extract common parts (hash table, ioctl etc)
 118   to a separate module (ip_tunnel.c).
 119
 120   Alexey Kuznetsov.
 121 */
 122
/* When set, receives that fail ECN decapsulation are logged (rate-limited)
 * from ipgre_rcv(); runtime-tunable via module parameter (mode 0644).
 */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

/* Forward declarations for routines referenced before their definitions. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

/* Buckets per hash table; HASH() below produces 4-bit indices. */
#define HASH_SIZE  16

/* Per-network-namespace state, looked up through net_generic(). */
static int ipgre_net_id __read_mostly;
struct ipgre_net {
        /* tunnels[prio][bucket]; the four priority rows are described in
         * the comment below (0 = wildcard ... 3 = remote+local).
         */
        struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

        /* Catch-all device that receives otherwise-unmatched GRE packets. */
        struct net_device *fb_tunnel_dev;
};
 142
 143/* Tunnel hash table */
 144
 145/*
 146   4 hash tables:
 147
 148   3: (remote,local)
 149   2: (remote,*)
 150   1: (*,local)
 151   0: (*,*)
 152
 153   We require exact key match i.e. if a key is present in packet
 154   it will match only tunnel with the same key; if it is not present,
 155   it will match only keyless tunnel.
 156
   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
 159 */
 160
/* Fold a 32-bit address (or GRE key) down to a 4-bit bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Symbolic names for the four priority rows of ipgre_net::tunnels. */
#define tunnels_r_l     tunnels[3]
#define tunnels_r       tunnels[2]
#define tunnels_l       tunnels[1]
#define tunnels_wc      tunnels[0]
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walk one hash chain under RCU.  NOTE: expands to a loop over a variable
 * named 't' which must already be declared in the calling scope.
 */
#define for_each_ip_tunnel_rcu(start) \
        for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 173
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
        u64     rx_packets;
        u64     rx_bytes;
        u64     tx_packets;
        u64     tx_bytes;
        /* Lets readers obtain a consistent snapshot of the four 64-bit
         * counters above (needed on 32-bit hosts); see ipgre_get_stats64().
         */
        struct u64_stats_sync   syncp;
};
 182
 183static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
 184                                                   struct rtnl_link_stats64 *tot)
 185{
 186        int i;
 187
 188        for_each_possible_cpu(i) {
 189                const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
 190                u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
 191                unsigned int start;
 192
 193                do {
 194                        start = u64_stats_fetch_begin_bh(&tstats->syncp);
 195                        rx_packets = tstats->rx_packets;
 196                        tx_packets = tstats->tx_packets;
 197                        rx_bytes = tstats->rx_bytes;
 198                        tx_bytes = tstats->tx_bytes;
 199                } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
 200
 201                tot->rx_packets += rx_packets;
 202                tot->tx_packets += tx_packets;
 203                tot->rx_bytes   += rx_bytes;
 204                tot->tx_bytes   += tx_bytes;
 205        }
 206
 207        tot->multicast = dev->stats.multicast;
 208        tot->rx_crc_errors = dev->stats.rx_crc_errors;
 209        tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
 210        tot->rx_length_errors = dev->stats.rx_length_errors;
 211        tot->rx_frame_errors = dev->stats.rx_frame_errors;
 212        tot->rx_errors = dev->stats.rx_errors;
 213
 214        tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
 215        tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
 216        tot->tx_dropped = dev->stats.tx_dropped;
 217        tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
 218        tot->tx_errors = dev->stats.tx_errors;
 219
 220        return tot;
 221}
 222
 223/* Does key in tunnel parameters match packet */
 224static bool ipgre_key_match(const struct ip_tunnel_parm *p,
 225                            __be16 flags, __be32 key)
 226{
 227        if (p->i_flags & GRE_KEY) {
 228                if (flags & GRE_KEY)
 229                        return key == p->i_key;
 230                else
 231                        return false;   /* key expected, none present */
 232        } else
 233                return !(flags & GRE_KEY);
 234}
 235
 236/* Given src, dst and key, find appropriate for input tunnel. */
 237
 238static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
 239                                             __be32 remote, __be32 local,
 240                                             __be16 flags, __be32 key,
 241                                             __be16 gre_proto)
 242{
 243        struct net *net = dev_net(dev);
 244        int link = dev->ifindex;
 245        unsigned int h0 = HASH(remote);
 246        unsigned int h1 = HASH(key);
 247        struct ip_tunnel *t, *cand = NULL;
 248        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 249        int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
 250                       ARPHRD_ETHER : ARPHRD_IPGRE;
 251        int score, cand_score = 4;
 252
 253        for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
 254                if (local != t->parms.iph.saddr ||
 255                    remote != t->parms.iph.daddr ||
 256                    !(t->dev->flags & IFF_UP))
 257                        continue;
 258
 259                if (!ipgre_key_match(&t->parms, flags, key))
 260                        continue;
 261
 262                if (t->dev->type != ARPHRD_IPGRE &&
 263                    t->dev->type != dev_type)
 264                        continue;
 265
 266                score = 0;
 267                if (t->parms.link != link)
 268                        score |= 1;
 269                if (t->dev->type != dev_type)
 270                        score |= 2;
 271                if (score == 0)
 272                        return t;
 273
 274                if (score < cand_score) {
 275                        cand = t;
 276                        cand_score = score;
 277                }
 278        }
 279
 280        for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
 281                if (remote != t->parms.iph.daddr ||
 282                    !(t->dev->flags & IFF_UP))
 283                        continue;
 284
 285                if (!ipgre_key_match(&t->parms, flags, key))
 286                        continue;
 287
 288                if (t->dev->type != ARPHRD_IPGRE &&
 289                    t->dev->type != dev_type)
 290                        continue;
 291
 292                score = 0;
 293                if (t->parms.link != link)
 294                        score |= 1;
 295                if (t->dev->type != dev_type)
 296                        score |= 2;
 297                if (score == 0)
 298                        return t;
 299
 300                if (score < cand_score) {
 301                        cand = t;
 302                        cand_score = score;
 303                }
 304        }
 305
 306        for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
 307                if ((local != t->parms.iph.saddr &&
 308                     (local != t->parms.iph.daddr ||
 309                      !ipv4_is_multicast(local))) ||
 310                    !(t->dev->flags & IFF_UP))
 311                        continue;
 312
 313                if (!ipgre_key_match(&t->parms, flags, key))
 314                        continue;
 315
 316                if (t->dev->type != ARPHRD_IPGRE &&
 317                    t->dev->type != dev_type)
 318                        continue;
 319
 320                score = 0;
 321                if (t->parms.link != link)
 322                        score |= 1;
 323                if (t->dev->type != dev_type)
 324                        score |= 2;
 325                if (score == 0)
 326                        return t;
 327
 328                if (score < cand_score) {
 329                        cand = t;
 330                        cand_score = score;
 331                }
 332        }
 333
 334        for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
 335                if (t->parms.i_key != key ||
 336                    !(t->dev->flags & IFF_UP))
 337                        continue;
 338
 339                if (t->dev->type != ARPHRD_IPGRE &&
 340                    t->dev->type != dev_type)
 341                        continue;
 342
 343                score = 0;
 344                if (t->parms.link != link)
 345                        score |= 1;
 346                if (t->dev->type != dev_type)
 347                        score |= 2;
 348                if (score == 0)
 349                        return t;
 350
 351                if (score < cand_score) {
 352                        cand = t;
 353                        cand_score = score;
 354                }
 355        }
 356
 357        if (cand != NULL)
 358                return cand;
 359
 360        dev = ign->fb_tunnel_dev;
 361        if (dev->flags & IFF_UP)
 362                return netdev_priv(dev);
 363
 364        return NULL;
 365}
 366
 367static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
 368                struct ip_tunnel_parm *parms)
 369{
 370        __be32 remote = parms->iph.daddr;
 371        __be32 local = parms->iph.saddr;
 372        __be32 key = parms->i_key;
 373        unsigned int h = HASH(key);
 374        int prio = 0;
 375
 376        if (local)
 377                prio |= 1;
 378        if (remote && !ipv4_is_multicast(remote)) {
 379                prio |= 2;
 380                h ^= HASH(remote);
 381        }
 382
 383        return &ign->tunnels[prio][h];
 384}
 385
 386static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
 387                struct ip_tunnel *t)
 388{
 389        return __ipgre_bucket(ign, &t->parms);
 390}
 391
/* Insert tunnel @t at the head of its hash chain.  Caller holds RTNL
 * (chain head is read with rtnl_dereference); concurrent RCU readers are
 * safe because the new node's ->next is published before the node itself
 * becomes reachable from the bucket head.
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
        struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

        rcu_assign_pointer(t->next, rtnl_dereference(*tp));
        rcu_assign_pointer(*tp, t);
}
 399
/* Remove tunnel @t from its hash chain, if present.  Caller holds RTNL;
 * only the predecessor's next pointer is rewritten, so concurrent RCU
 * readers always see a consistent chain.
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
        struct ip_tunnel __rcu **tp;
        struct ip_tunnel *iter;

        for (tp = ipgre_bucket(ign, t);
             (iter = rtnl_dereference(*tp)) != NULL;
             tp = &iter->next) {
                if (t == iter) {
                        /* Splice t out by pointing its predecessor past it. */
                        rcu_assign_pointer(*tp, t->next);
                        break;
                }
        }
}
 414
 415static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 416                                           struct ip_tunnel_parm *parms,
 417                                           int type)
 418{
 419        __be32 remote = parms->iph.daddr;
 420        __be32 local = parms->iph.saddr;
 421        __be32 key = parms->i_key;
 422        int link = parms->link;
 423        struct ip_tunnel *t;
 424        struct ip_tunnel __rcu **tp;
 425        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 426
 427        for (tp = __ipgre_bucket(ign, parms);
 428             (t = rtnl_dereference(*tp)) != NULL;
 429             tp = &t->next)
 430                if (local == t->parms.iph.saddr &&
 431                    remote == t->parms.iph.daddr &&
 432                    key == t->parms.i_key &&
 433                    link == t->parms.link &&
 434                    type == t->dev->type)
 435                        break;
 436
 437        return t;
 438}
 439
/* Find the tunnel matching @parms; if none exists and @create is set,
 * allocate and register a new ARPHRD_IPGRE netdevice for it.  Requires
 * RTNL (register_netdevice(), rtnl_dereference via ipgre_tunnel_find()).
 * Returns NULL when not found and !create, or on allocation/registration
 * failure.
 */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
                struct ip_tunnel_parm *parms, int create)
{
        struct ip_tunnel *t, *nt;
        struct net_device *dev;
        char name[IFNAMSIZ];
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);

        t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
        if (t || !create)
                return t;

        /* Use the requested name, or let the core pick gre0, gre1, ... */
        if (parms->name[0])
                strlcpy(name, parms->name, IFNAMSIZ);
        else
                strcpy(name, "gre%d");

        /* The ip_tunnel lives in the netdevice's private area. */
        dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
        if (!dev)
                return NULL;

        dev_net_set(dev, net);

        nt = netdev_priv(dev);
        nt->parms = *parms;
        dev->rtnl_link_ops = &ipgre_link_ops;

        dev->mtu = ipgre_tunnel_bind_dev(dev);

        if (register_netdevice(dev) < 0)
                goto failed_free;

        /* Can use a lockless transmit, unless we generate output sequences */
        if (!(nt->parms.o_flags & GRE_SEQ))
                dev->features |= NETIF_F_LLTX;

        /* Reference for the hash table; dropped in ipgre_tunnel_uninit(). */
        dev_hold(dev);
        ipgre_tunnel_link(ign, nt);
        return nt;

failed_free:
        free_netdev(dev);
        return NULL;
}
 484
 485static void ipgre_tunnel_uninit(struct net_device *dev)
 486{
 487        struct net *net = dev_net(dev);
 488        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 489
 490        ipgre_tunnel_unlink(ign, netdev_priv(dev));
 491        dev_put(dev);
 492}
 493
 494
 495static void ipgre_err(struct sk_buff *skb, u32 info)
 496{
 497
 498/* All the routers (except for Linux) return only
 499   8 bytes of packet payload. It means, that precise relaying of
 500   ICMP in the real Internet is absolutely infeasible.
 501
 502   Moreover, Cisco "wise men" put GRE key to the third word
 503   in GRE header. It makes impossible maintaining even soft state for keyed
 504   GRE tunnels with enabled checksum. Tell them "thank you".
 505
 506   Well, I wonder, rfc1812 was written by Cisco employee,
 507   what the hell these idiots break standards established
 508   by themselves???
 509 */
 510
 511        const struct iphdr *iph = (const struct iphdr *)skb->data;
 512        __be16       *p = (__be16 *)(skb->data+(iph->ihl<<2));
 513        int grehlen = (iph->ihl<<2) + 4;
 514        const int type = icmp_hdr(skb)->type;
 515        const int code = icmp_hdr(skb)->code;
 516        struct ip_tunnel *t;
 517        __be16 flags;
 518        __be32 key = 0;
 519
 520        flags = p[0];
 521        if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
 522                if (flags&(GRE_VERSION|GRE_ROUTING))
 523                        return;
 524                if (flags&GRE_KEY) {
 525                        grehlen += 4;
 526                        if (flags&GRE_CSUM)
 527                                grehlen += 4;
 528                }
 529        }
 530
 531        /* If only 8 bytes returned, keyed message will be dropped here */
 532        if (skb_headlen(skb) < grehlen)
 533                return;
 534
 535        if (flags & GRE_KEY)
 536                key = *(((__be32 *)p) + (grehlen / 4) - 1);
 537
 538        switch (type) {
 539        default:
 540        case ICMP_PARAMETERPROB:
 541                return;
 542
 543        case ICMP_DEST_UNREACH:
 544                switch (code) {
 545                case ICMP_SR_FAILED:
 546                case ICMP_PORT_UNREACH:
 547                        /* Impossible event. */
 548                        return;
 549                default:
 550                        /* All others are translated to HOST_UNREACH.
 551                           rfc2003 contains "deep thoughts" about NET_UNREACH,
 552                           I believe they are just ether pollution. --ANK
 553                         */
 554                        break;
 555                }
 556                break;
 557        case ICMP_TIME_EXCEEDED:
 558                if (code != ICMP_EXC_TTL)
 559                        return;
 560                break;
 561
 562        case ICMP_REDIRECT:
 563                break;
 564        }
 565
 566        t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
 567                                flags, key, p[1]);
 568
 569        if (t == NULL)
 570                return;
 571
 572        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 573                ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 574                                 t->parms.link, 0, IPPROTO_GRE, 0);
 575                return;
 576        }
 577        if (type == ICMP_REDIRECT) {
 578                ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
 579                              IPPROTO_GRE, 0);
 580                return;
 581        }
 582        if (t->parms.iph.daddr == 0 ||
 583            ipv4_is_multicast(t->parms.iph.daddr))
 584                return;
 585
 586        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
 587                return;
 588
 589        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 590                t->err_count++;
 591        else
 592                t->err_count = 1;
 593        t->err_time = jiffies;
 594}
 595
 596static inline u8
 597ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
 598{
 599        u8 inner = 0;
 600        if (skb->protocol == htons(ETH_P_IP))
 601                inner = old_iph->tos;
 602        else if (skb->protocol == htons(ETH_P_IPV6))
 603                inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 604        return INET_ECN_encapsulate(tos, inner);
 605}
 606
/* GRE receive handler (IPPROTO_GRE).  Parses the GRE header, finds the
 * matching tunnel and hands the inner packet back to the stack.  Always
 * returns 0; unmatched packets are answered with ICMP port-unreachable,
 * malformed or rejected ones are silently dropped.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
        const struct iphdr *iph;
        u8     *h;
        __be16    flags;
        __sum16   csum = 0;
        __be32 key = 0;
        u32    seqno = 0;
        struct ip_tunnel *tunnel;
        int    offset = 4;
        __be16 gre_proto;
        int    err;

        /* 16 bytes covers the largest header parsed below:
         * flags/proto (4) + csum word (4) + key (4) + seq (4).
         */
        if (!pskb_may_pull(skb, 16))
                goto drop;

        iph = ip_hdr(skb);
        h = skb->data;
        flags = *(__be16 *)h;

        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
                /* - Version must be 0.
                   - We do not support routing headers.
                 */
                if (flags&(GRE_VERSION|GRE_ROUTING))
                        goto drop;

                if (flags&GRE_CSUM) {
                        /* Verify the GRE checksum over the whole packet;
                         * a nonzero csum here means verification failed
                         * (accounted after the tunnel lookup below so the
                         * error lands on the right device's stats).
                         */
                        switch (skb->ip_summed) {
                        case CHECKSUM_COMPLETE:
                                csum = csum_fold(skb->csum);
                                if (!csum)
                                        break;
                                /* fall through */
                        case CHECKSUM_NONE:
                                skb->csum = 0;
                                csum = __skb_checksum_complete(skb);
                                skb->ip_summed = CHECKSUM_COMPLETE;
                        }
                        offset += 4;
                }
                if (flags&GRE_KEY) {
                        key = *(__be32 *)(h + offset);
                        offset += 4;
                }
                if (flags&GRE_SEQ) {
                        seqno = ntohl(*(__be32 *)(h + offset));
                        offset += 4;
                }
        }

        gre_proto = *(__be16 *)(h + 2);

        /* Receive direction: the packet's saddr is the tunnel's remote. */
        tunnel = ipgre_tunnel_lookup(skb->dev,
                                     iph->saddr, iph->daddr, flags, key,
                                     gre_proto);
        if (tunnel) {
                struct pcpu_tstats *tstats;

                secpath_reset(skb);

                skb->protocol = gre_proto;
                /* WCCP version 1 and 2 protocol decoding.
                 * - Change protocol to IP
                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
                 */
                if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
                        skb->protocol = htons(ETH_P_IP);
                        /* Not an IPv4 version nibble: WCCPv2 redirect header. */
                        if ((*(h + offset) & 0xF0) != 0x40)
                                offset += 4;
                }

                /* Strip the GRE header and fix up the receive checksum. */
                skb->mac_header = skb->network_header;
                __pskb_pull(skb, offset);
                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
                skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        /* Looped back packet, drop it! */
                        if (rt_is_output_route(skb_rtable(skb)))
                                goto drop;
                        tunnel->dev->stats.multicast++;
                        skb->pkt_type = PACKET_BROADCAST;
                }
#endif

                /* Reject a failed checksum, or a missing one when the
                 * tunnel is configured to require checksums.
                 */
                if (((flags&GRE_CSUM) && csum) ||
                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
                        tunnel->dev->stats.rx_crc_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                /* Enforce monotonically increasing sequence numbers when
                 * the tunnel requires them.
                 */
                if (tunnel->parms.i_flags&GRE_SEQ) {
                        if (!(flags&GRE_SEQ) ||
                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
                                tunnel->dev->stats.rx_fifo_errors++;
                                tunnel->dev->stats.rx_errors++;
                                goto drop;
                        }
                        tunnel->i_seqno = seqno + 1;
                }

                /* Warning: All skb pointers will be invalidated! */
                if (tunnel->dev->type == ARPHRD_ETHER) {
                        if (!pskb_may_pull(skb, ETH_HLEN)) {
                                tunnel->dev->stats.rx_length_errors++;
                                tunnel->dev->stats.rx_errors++;
                                goto drop;
                        }

                        /* Re-read iph: pskb_may_pull may have reallocated. */
                        iph = ip_hdr(skb);
                        skb->protocol = eth_type_trans(skb, tunnel->dev);
                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
                }

                __skb_tunnel_rx(skb, tunnel->dev);

                skb_reset_network_header(skb);
                err = IP_ECN_decapsulate(iph, skb);
                if (unlikely(err)) {
                        if (log_ecn_error)
                                net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                                     &iph->saddr, iph->tos);
                        if (err > 1) {
                                ++tunnel->dev->stats.rx_frame_errors;
                                ++tunnel->dev->stats.rx_errors;
                                goto drop;
                        }
                }

                tstats = this_cpu_ptr(tunnel->dev->tstats);
                u64_stats_update_begin(&tstats->syncp);
                tstats->rx_packets++;
                tstats->rx_bytes += skb->len;
                u64_stats_update_end(&tstats->syncp);

                gro_cells_receive(&tunnel->gro_cells, skb);
                return 0;
        }
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
        kfree_skb(skb);
        return 0;
}
 752
 753static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 754{
 755        struct ip_tunnel *tunnel = netdev_priv(dev);
 756        struct pcpu_tstats *tstats;
 757        const struct iphdr  *old_iph = ip_hdr(skb);
 758        const struct iphdr  *tiph;
 759        struct flowi4 fl4;
 760        u8     tos;
 761        __be16 df;
 762        struct rtable *rt;                      /* Route to the other host */
 763        struct net_device *tdev;                /* Device to other host */
 764        struct iphdr  *iph;                     /* Our new IP header */
 765        unsigned int max_headroom;              /* The extra header space needed */
 766        int    gre_hlen;
 767        __be32 dst;
 768        int    mtu;
 769
 770        if (skb->ip_summed == CHECKSUM_PARTIAL &&
 771            skb_checksum_help(skb))
 772                goto tx_error;
 773
 774        if (dev->type == ARPHRD_ETHER)
 775                IPCB(skb)->flags = 0;
 776
 777        if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
 778                gre_hlen = 0;
 779                tiph = (const struct iphdr *)skb->data;
 780        } else {
 781                gre_hlen = tunnel->hlen;
 782                tiph = &tunnel->parms.iph;
 783        }
 784
 785        if ((dst = tiph->daddr) == 0) {
 786                /* NBMA tunnel */
 787
 788                if (skb_dst(skb) == NULL) {
 789                        dev->stats.tx_fifo_errors++;
 790                        goto tx_error;
 791                }
 792
 793                if (skb->protocol == htons(ETH_P_IP)) {
 794                        rt = skb_rtable(skb);
 795                        dst = rt_nexthop(rt, old_iph->daddr);
 796                }
 797#if IS_ENABLED(CONFIG_IPV6)
 798                else if (skb->protocol == htons(ETH_P_IPV6)) {
 799                        const struct in6_addr *addr6;
 800                        struct neighbour *neigh;
 801                        bool do_tx_error_icmp;
 802                        int addr_type;
 803
 804                        neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
 805                        if (neigh == NULL)
 806                                goto tx_error;
 807
 808                        addr6 = (const struct in6_addr *)&neigh->primary_key;
 809                        addr_type = ipv6_addr_type(addr6);
 810
 811                        if (addr_type == IPV6_ADDR_ANY) {
 812                                addr6 = &ipv6_hdr(skb)->daddr;
 813                                addr_type = ipv6_addr_type(addr6);
 814                        }
 815
 816                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 817                                do_tx_error_icmp = true;
 818                        else {
 819                                do_tx_error_icmp = false;
 820                                dst = addr6->s6_addr32[3];
 821                        }
 822                        neigh_release(neigh);
 823                        if (do_tx_error_icmp)
 824                                goto tx_error_icmp;
 825                }
 826#endif
 827                else
 828                        goto tx_error;
 829        }
 830
 831        tos = tiph->tos;
 832        if (tos == 1) {
 833                tos = 0;
 834                if (skb->protocol == htons(ETH_P_IP))
 835                        tos = old_iph->tos;
 836                else if (skb->protocol == htons(ETH_P_IPV6))
 837                        tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 838        }
 839
 840        rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
 841                                 tunnel->parms.o_key, RT_TOS(tos),
 842                                 tunnel->parms.link);
 843        if (IS_ERR(rt)) {
 844                dev->stats.tx_carrier_errors++;
 845                goto tx_error;
 846        }
 847        tdev = rt->dst.dev;
 848
 849        if (tdev == dev) {
 850                ip_rt_put(rt);
 851                dev->stats.collisions++;
 852                goto tx_error;
 853        }
 854
 855        df = tiph->frag_off;
 856        if (df)
 857                mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
 858        else
 859                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 860
 861        if (skb_dst(skb))
 862                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 863
 864        if (skb->protocol == htons(ETH_P_IP)) {
 865                df |= (old_iph->frag_off&htons(IP_DF));
 866
 867                if ((old_iph->frag_off&htons(IP_DF)) &&
 868                    mtu < ntohs(old_iph->tot_len)) {
 869                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 870                        ip_rt_put(rt);
 871                        goto tx_error;
 872                }
 873        }
 874#if IS_ENABLED(CONFIG_IPV6)
 875        else if (skb->protocol == htons(ETH_P_IPV6)) {
 876                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 877
 878                if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
 879                        if ((tunnel->parms.iph.daddr &&
 880                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 881                            rt6->rt6i_dst.plen == 128) {
 882                                rt6->rt6i_flags |= RTF_MODIFIED;
 883                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
 884                        }
 885                }
 886
 887                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
 888                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 889                        ip_rt_put(rt);
 890                        goto tx_error;
 891                }
 892        }
 893#endif
 894
 895        if (tunnel->err_count > 0) {
 896                if (time_before(jiffies,
 897                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 898                        tunnel->err_count--;
 899
 900                        dst_link_failure(skb);
 901                } else
 902                        tunnel->err_count = 0;
 903        }
 904
 905        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
 906
 907        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
 908            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
 909                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 910                if (max_headroom > dev->needed_headroom)
 911                        dev->needed_headroom = max_headroom;
 912                if (!new_skb) {
 913                        ip_rt_put(rt);
 914                        dev->stats.tx_dropped++;
 915                        dev_kfree_skb(skb);
 916                        return NETDEV_TX_OK;
 917                }
 918                if (skb->sk)
 919                        skb_set_owner_w(new_skb, skb->sk);
 920                dev_kfree_skb(skb);
 921                skb = new_skb;
 922                old_iph = ip_hdr(skb);
 923        }
 924
 925        skb_reset_transport_header(skb);
 926        skb_push(skb, gre_hlen);
 927        skb_reset_network_header(skb);
 928        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 929        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 930                              IPSKB_REROUTED);
 931        skb_dst_drop(skb);
 932        skb_dst_set(skb, &rt->dst);
 933
 934        /*
 935         *      Push down and install the IPIP header.
 936         */
 937
 938        iph                     =       ip_hdr(skb);
 939        iph->version            =       4;
 940        iph->ihl                =       sizeof(struct iphdr) >> 2;
 941        iph->frag_off           =       df;
 942        iph->protocol           =       IPPROTO_GRE;
 943        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
 944        iph->daddr              =       fl4.daddr;
 945        iph->saddr              =       fl4.saddr;
 946
 947        if ((iph->ttl = tiph->ttl) == 0) {
 948                if (skb->protocol == htons(ETH_P_IP))
 949                        iph->ttl = old_iph->ttl;
 950#if IS_ENABLED(CONFIG_IPV6)
 951                else if (skb->protocol == htons(ETH_P_IPV6))
 952                        iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
 953#endif
 954                else
 955                        iph->ttl = ip4_dst_hoplimit(&rt->dst);
 956        }
 957
 958        ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
 959        ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
 960                                   htons(ETH_P_TEB) : skb->protocol;
 961
 962        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
 963                __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
 964
 965                if (tunnel->parms.o_flags&GRE_SEQ) {
 966                        ++tunnel->o_seqno;
 967                        *ptr = htonl(tunnel->o_seqno);
 968                        ptr--;
 969                }
 970                if (tunnel->parms.o_flags&GRE_KEY) {
 971                        *ptr = tunnel->parms.o_key;
 972                        ptr--;
 973                }
 974                if (tunnel->parms.o_flags&GRE_CSUM) {
 975                        *ptr = 0;
 976                        *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
 977                }
 978        }
 979
 980        nf_reset(skb);
 981        tstats = this_cpu_ptr(dev->tstats);
 982        __IPTUNNEL_XMIT(tstats, &dev->stats);
 983        return NETDEV_TX_OK;
 984
 985#if IS_ENABLED(CONFIG_IPV6)
 986tx_error_icmp:
 987        dst_link_failure(skb);
 988#endif
 989tx_error:
 990        dev->stats.tx_errors++;
 991        dev_kfree_skb(skb);
 992        return NETDEV_TX_OK;
 993}
 994
/* Bind the tunnel to its underlying output device and compute an MTU.
 *
 * Routes towards the tunnel's remote endpoint (falling back to the
 * explicitly configured tunnel->parms.link when no route resolves) to
 * guess the egress device, then derives needed_headroom and the
 * precomputed encapsulation length (tunnel->hlen) from the configured
 * o_flags.
 *
 * Returns the MTU the tunnel device should use, clamped to at least
 * 68 (the minimum IPv4 MTU).
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel;
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int addend = sizeof(struct iphdr) + 4;  /* outer IP + base GRE header */

        tunnel = netdev_priv(dev);
        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */

        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                rt = ip_route_output_gre(dev_net(dev), &fl4,
                                         iph->daddr, iph->saddr,
                                         tunnel->parms.o_key,
                                         RT_TOS(iph->tos),
                                         tunnel->parms.link);
                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }

                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;
        }

        /* No route (yet): fall back to the explicitly bound link, if any. */
        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = tdev->mtu;
        }
        dev->iflink = tunnel->parms.link;

        /* Precalculate GRE options length */
        if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
                if (tunnel->parms.o_flags&GRE_CSUM)
                        addend += 4;
                if (tunnel->parms.o_flags&GRE_KEY)
                        addend += 4;
                if (tunnel->parms.o_flags&GRE_SEQ)
                        addend += 4;
        }
        dev->needed_headroom = addend + hlen;
        mtu -= dev->hard_header_len + addend;

        if (mtu < 68)
                mtu = 68;

        tunnel->hlen = addend;

        return mtu;
}
1055
/* Legacy ioctl interface (SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL /
 * SIOCDELTUNNEL) for GRE tunnels, exchanging an ip_tunnel_parm with
 * userspace via ifr->ifr_ifru.ifru_data.
 *
 * Returns 0 on success or a negative errno.  ADD/CHG/DEL require
 * CAP_NET_ADMIN.  Runs under RTNL (ioctl path).
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
        int err = 0;
        struct ip_tunnel_parm p;
        struct ip_tunnel *t;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                t = NULL;
                /* On the fallback device, look up the tunnel named by the
                 * user-supplied parms; on a real tunnel device, report
                 * that device's own parms. */
                if (dev == ign->fb_tunnel_dev) {
                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
                                err = -EFAULT;
                                break;
                        }
                        t = ipgre_tunnel_locate(net, &p, 0);
                }
                if (t == NULL)
                        t = netdev_priv(dev);
                memcpy(&p, &t->parms, sizeof(p));
                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                        err = -EFAULT;
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!capable(CAP_NET_ADMIN))
                        goto done;

                err = -EFAULT;
                if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                        goto done;

                /* Sanity-check the outer header template and reject GRE
                 * flag bits (version/routing) we do not implement. */
                err = -EINVAL;
                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
                        goto done;
                if (p.iph.ttl)
                        p.iph.frag_off |= htons(IP_DF);

                /* Keys are meaningful only when the corresponding GRE_KEY
                 * flag is set; normalize so lookups compare cleanly. */
                if (!(p.i_flags&GRE_KEY))
                        p.i_key = 0;
                if (!(p.o_flags&GRE_KEY))
                        p.o_key = 0;

                t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

                if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t != NULL) {
                                /* parms already belong to another device */
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                t = netdev_priv(dev);

                                if (ipv4_is_multicast(p.iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p.iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                /* Changing between broadcast / p-t-p modes
                                 * is not allowed on a live device. */
                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }
                                /* Re-hash under the new endpoint addresses;
                                 * wait out concurrent readers in between. */
                                ipgre_tunnel_unlink(ign, t);
                                synchronize_net();
                                t->parms.iph.saddr = p.iph.saddr;
                                t->parms.iph.daddr = p.iph.daddr;
                                t->parms.i_key = p.i_key;
                                t->parms.o_key = p.o_key;
                                memcpy(dev->dev_addr, &p.iph.saddr, 4);
                                memcpy(dev->broadcast, &p.iph.daddr, 4);
                                ipgre_tunnel_link(ign, t);
                                netdev_state_change(dev);
                        }
                }

                if (t) {
                        err = 0;
                        if (cmd == SIOCCHGTUNNEL) {
                                t->parms.iph.ttl = p.iph.ttl;
                                t->parms.iph.tos = p.iph.tos;
                                t->parms.iph.frag_off = p.iph.frag_off;
                                if (t->parms.link != p.link) {
                                        t->parms.link = p.link;
                                        dev->mtu = ipgre_tunnel_bind_dev(dev);
                                        netdev_state_change(dev);
                                }
                        }
                        /* Echo the effective parms back to userspace. */
                        if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
                                err = -EFAULT;
                } else
                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!capable(CAP_NET_ADMIN))
                        goto done;

                if (dev == ign->fb_tunnel_dev) {
                        err = -EFAULT;
                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                                goto done;
                        err = -ENOENT;
                        if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
                                goto done;
                        err = -EPERM;
                        /* The fallback device itself may not be deleted. */
                        if (t == netdev_priv(ign->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
1186
1187static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1188{
1189        struct ip_tunnel *tunnel = netdev_priv(dev);
1190        if (new_mtu < 68 ||
1191            new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1192                return -EINVAL;
1193        dev->mtu = new_mtu;
1194        return 0;
1195}
1196
1197/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1199   over the Internet, provided multicast routing is tuned.
1200
1201
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
1204   I have an impression, that Cisco could make something similar,
1205   but this feature is apparently missing in IOS<=11.2(8).
1206
1207   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1208   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1209
1210   ping -t 255 224.66.66.66
1211
1212   If nobody answers, mbone does not work.
1213
1214   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1215   ip addr add 10.66.66.<somewhat>/24 dev Universe
1216   ifconfig Universe up
1217   ifconfig Universe add fe80::<Your_real_addr>/10
1218   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1219   ftp 10.66.66.66
1220   ...
1221   ftp fec0:6666:6666::193.233.7.65
1222   ...
1223
1224 */
1225
1226static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1227                        unsigned short type,
1228                        const void *daddr, const void *saddr, unsigned int len)
1229{
1230        struct ip_tunnel *t = netdev_priv(dev);
1231        struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1232        __be16 *p = (__be16 *)(iph+1);
1233
1234        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1235        p[0]            = t->parms.o_flags;
1236        p[1]            = htons(type);
1237
1238        /*
1239         *      Set the source hardware address.
1240         */
1241
1242        if (saddr)
1243                memcpy(&iph->saddr, saddr, 4);
1244        if (daddr)
1245                memcpy(&iph->daddr, daddr, 4);
1246        if (iph->daddr)
1247                return t->hlen;
1248
1249        return -t->hlen;
1250}
1251
1252static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1253{
1254        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1255        memcpy(haddr, &iph->saddr, 4);
1256        return 4;
1257}
1258
/* Hard-header operations installed on broadcast-mode and
 * destination-less tunnels (see ipgre_tunnel_init), so the peer
 * address can be supplied per packet. */
static const struct header_ops ipgre_header_ops = {
        .create = ipgre_header,
        .parse  = ipgre_header_parse,
};
1263
1264#ifdef CONFIG_NET_IPGRE_BROADCAST
/* ndo_open: for tunnels with a multicast destination, join the group
 * on the underlying physical device so encapsulated broadcast traffic
 * is received.  Runs under RTNL. */
static int ipgre_open(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr)) {
                struct flowi4 fl4;
                struct rtable *rt;

                /* Route towards the multicast destination to discover
                 * which physical device carries the membership. */
                rt = ip_route_output_gre(dev_net(dev), &fl4,
                                         t->parms.iph.daddr,
                                         t->parms.iph.saddr,
                                         t->parms.o_key,
                                         RT_TOS(t->parms.iph.tos),
                                         t->parms.link);
                if (IS_ERR(rt))
                        return -EADDRNOTAVAIL;
                dev = rt->dst.dev;      /* from here on, dev is the underlying device */
                ip_rt_put(rt);
                if (__in_dev_get_rtnl(dev) == NULL)
                        return -EADDRNOTAVAIL;
                t->mlink = dev->ifindex;        /* remembered for ipgre_close() */
                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
        }
        return 0;
}
1290
1291static int ipgre_close(struct net_device *dev)
1292{
1293        struct ip_tunnel *t = netdev_priv(dev);
1294
1295        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1296                struct in_device *in_dev;
1297                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1298                if (in_dev)
1299                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1300        }
1301        return 0;
1302}
1303
1304#endif
1305
/* Device callbacks for layer-3 (ARPHRD_IPGRE) tunnel devices; the
 * open/stop hooks exist only to manage broadcast-mode multicast
 * membership. */
static const struct net_device_ops ipgre_netdev_ops = {
        .ndo_init               = ipgre_tunnel_init,
        .ndo_uninit             = ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
        .ndo_open               = ipgre_open,
        .ndo_stop               = ipgre_close,
#endif
        .ndo_start_xmit         = ipgre_tunnel_xmit,
        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
        .ndo_get_stats64        = ipgre_get_stats64,
};
1318
/* dev->destructor for both GRE device flavours: release GRO cell
 * state and per-cpu stats before freeing the netdev itself.
 * NOTE(review): gretap devices never call gro_cells_init() (see
 * ipgre_tap_init); this presumably relies on gro_cells_destroy()
 * tolerating the zero-initialized state — confirm. */
static void ipgre_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        free_percpu(dev->tstats);
        free_netdev(dev);
}
1327
/* Offload features a GRE device advertises (also exposed via
 * hw_features in ipgre_tunnel_setup). */
#define GRE_FEATURES (NETIF_F_SG |              \
                      NETIF_F_FRAGLIST |        \
                      NETIF_F_HIGHDMA |         \
                      NETIF_F_HW_CSUM)
1332
1333static void ipgre_tunnel_setup(struct net_device *dev)
1334{
1335        dev->netdev_ops         = &ipgre_netdev_ops;
1336        dev->destructor         = ipgre_dev_free;
1337
1338        dev->type               = ARPHRD_IPGRE;
1339        dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1340        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1341        dev->flags              = IFF_NOARP;
1342        dev->iflink             = 0;
1343        dev->addr_len           = 4;
1344        dev->features           |= NETIF_F_NETNS_LOCAL;
1345        dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1346
1347        dev->features           |= GRE_FEATURES;
1348        dev->hw_features        |= GRE_FEATURES;
1349}
1350
/* ndo_init for layer-3 GRE devices: copy the tunnel endpoints into
 * the device's "hardware" addresses, pick header_ops where the peer
 * address is not fixed, and allocate per-cpu stats and GRO cells.
 * Returns 0 or a negative errno; on failure everything allocated
 * here is released again. */
static int ipgre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel;
        struct iphdr *iph;
        int err;

        tunnel = netdev_priv(dev);
        iph = &tunnel->parms.iph;

        tunnel->dev = dev;
        strcpy(tunnel->parms.name, dev->name);

        memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
        memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

        if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
                /* Broadcast mode needs an explicit local address. */
                if (ipv4_is_multicast(iph->daddr)) {
                        if (!iph->saddr)
                                return -EINVAL;
                        dev->flags = IFF_BROADCAST;
                        dev->header_ops = &ipgre_header_ops;
                }
#endif
        } else
                /* No fixed destination: header_ops allow the peer
                 * address to be supplied per packet. */
                dev->header_ops = &ipgre_header_ops;

        dev->tstats = alloc_percpu(struct pcpu_tstats);
        if (!dev->tstats)
                return -ENOMEM;

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                /* Unwind the stats allocation on partial failure. */
                free_percpu(dev->tstats);
                return err;
        }

        return 0;
}
1390
/* Initialize the per-namespace fallback "gre0" device: a keyless,
 * address-less tunnel whose outer header template is filled in here.
 * The dev_hold() takes an extra long-lived reference on the device. */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;

        tunnel->dev = dev;
        strcpy(tunnel->parms.name, dev->name);

        iph->version            = 4;
        iph->protocol           = IPPROTO_GRE;
        iph->ihl                = 5;
        /* Base encapsulation: outer IP header plus 4-byte GRE header. */
        tunnel->hlen            = sizeof(struct iphdr) + 4;

        dev_hold(dev);
}
1406
1407
/* Receive and ICMP-error handlers registered with the GRE protocol
 * demultiplexer. */
static const struct gre_protocol ipgre_protocol = {
        .handler     = ipgre_rcv,
        .err_handler = ipgre_err,
};
1412
1413static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1414{
1415        int prio;
1416
1417        for (prio = 0; prio < 4; prio++) {
1418                int h;
1419                for (h = 0; h < HASH_SIZE; h++) {
1420                        struct ip_tunnel *t;
1421
1422                        t = rtnl_dereference(ign->tunnels[prio][h]);
1423
1424                        while (t != NULL) {
1425                                unregister_netdevice_queue(t->dev, head);
1426                                t = rtnl_dereference(t->next);
1427                        }
1428                }
1429        }
1430}
1431
/* Per-namespace init: create and register the fallback "gre0" device
 * and install its tunnel in the wildcard hash slot.  Returns 0 or a
 * negative errno with everything unwound. */
static int __net_init ipgre_init_net(struct net *net)
{
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
        int err;

        ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
                                           ipgre_tunnel_setup);
        if (!ign->fb_tunnel_dev) {
                err = -ENOMEM;
                goto err_alloc_dev;
        }
        dev_net_set(ign->fb_tunnel_dev, net);

        ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
        ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

        if ((err = register_netdev(ign->fb_tunnel_dev)))
                goto err_reg_dev;

        rcu_assign_pointer(ign->tunnels_wc[0],
                           netdev_priv(ign->fb_tunnel_dev));
        return 0;

err_reg_dev:
        /* Never registered, so free directly via the destructor path. */
        ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
        return err;
}
1460
1461static void __net_exit ipgre_exit_net(struct net *net)
1462{
1463        struct ipgre_net *ign;
1464        LIST_HEAD(list);
1465
1466        ign = net_generic(net, ipgre_net_id);
1467        rtnl_lock();
1468        ipgre_destroy_tunnels(ign, &list);
1469        unregister_netdevice_many(&list);
1470        rtnl_unlock();
1471}
1472
/* Network-namespace hooks; .size allocates the per-net ipgre_net
 * state looked up via net_generic(net, ipgre_net_id). */
static struct pernet_operations ipgre_net_ops = {
        .init = ipgre_init_net,
        .exit = ipgre_exit_net,
        .id   = &ipgre_net_id,
        .size = sizeof(struct ipgre_net),
};
1479
1480static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1481{
1482        __be16 flags;
1483
1484        if (!data)
1485                return 0;
1486
1487        flags = 0;
1488        if (data[IFLA_GRE_IFLAGS])
1489                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1490        if (data[IFLA_GRE_OFLAGS])
1491                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1492        if (flags & (GRE_VERSION|GRE_ROUTING))
1493                return -EINVAL;
1494
1495        return 0;
1496}
1497
1498static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1499{
1500        __be32 daddr;
1501
1502        if (tb[IFLA_ADDRESS]) {
1503                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1504                        return -EINVAL;
1505                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1506                        return -EADDRNOTAVAIL;
1507        }
1508
1509        if (!data)
1510                goto out;
1511
1512        if (data[IFLA_GRE_REMOTE]) {
1513                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1514                if (!daddr)
1515                        return -EINVAL;
1516        }
1517
1518out:
1519        return ipgre_tunnel_validate(tb, data);
1520}
1521
/* Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * Absent attributes leave the zeroed defaults in place; path-MTU
 * discovery defaults to on (IP_DF set) unless explicitly disabled
 * via IFLA_GRE_PMTUDISC. */
static void ipgre_netlink_parms(struct nlattr *data[],
                                struct ip_tunnel_parm *parms)
{
        memset(parms, 0, sizeof(*parms));

        parms->iph.protocol = IPPROTO_GRE;

        if (!data)
                return;

        if (data[IFLA_GRE_LINK])
                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

        if (data[IFLA_GRE_IFLAGS])
                parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

        if (data[IFLA_GRE_OFLAGS])
                parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

        if (data[IFLA_GRE_IKEY])
                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

        if (data[IFLA_GRE_OKEY])
                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

        if (data[IFLA_GRE_LOCAL])
                parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

        if (data[IFLA_GRE_REMOTE])
                parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

        if (data[IFLA_GRE_TTL])
                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

        if (data[IFLA_GRE_TOS])
                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
                parms->iph.frag_off = htons(IP_DF);
}
1562
1563static int ipgre_tap_init(struct net_device *dev)
1564{
1565        struct ip_tunnel *tunnel;
1566
1567        tunnel = netdev_priv(dev);
1568
1569        tunnel->dev = dev;
1570        strcpy(tunnel->parms.name, dev->name);
1571
1572        ipgre_tunnel_bind_dev(dev);
1573
1574        dev->tstats = alloc_percpu(struct pcpu_tstats);
1575        if (!dev->tstats)
1576                return -ENOMEM;
1577
1578        return 0;
1579}
1580
/* Device callbacks for Ethernet-over-GRE (gretap) devices; MAC
 * address handling comes from the generic eth_* helpers. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
        .ndo_init               = ipgre_tap_init,
        .ndo_uninit             = ipgre_tunnel_uninit,
        .ndo_start_xmit         = ipgre_tunnel_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
        .ndo_get_stats64        = ipgre_get_stats64,
};
1590
/* netdev setup for gretap devices: start from a standard Ethernet
 * device, then override the tunnel-specific callbacks. */
static void ipgre_tap_setup(struct net_device *dev)
{

        ether_setup(dev);

        dev->netdev_ops         = &ipgre_tap_netdev_ops;
        dev->destructor         = ipgre_dev_free;

        dev->iflink             = 0;
        dev->features           |= NETIF_F_NETNS_LOCAL;
}
1602
/* rtnl newlink handler: create a GRE or gretap device from netlink
 * attributes.  Fails with -EEXIST when an equivalent tunnel already
 * exists in this namespace. */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
                         struct nlattr *data[])
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
        int mtu;
        int err;

        nt = netdev_priv(dev);
        ipgre_netlink_parms(data, &nt->parms);

        if (ipgre_tunnel_find(net, &nt->parms, dev->type))
                return -EEXIST;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ipgre_tunnel_bind_dev(dev);
        /* An explicit IFLA_MTU overrides the computed value. */
        if (!tb[IFLA_MTU])
                dev->mtu = mtu;

        /* Can use a lockless transmit, unless we generate output sequences */
        if (!(nt->parms.o_flags & GRE_SEQ))
                dev->features |= NETIF_F_LLTX;

        err = register_netdevice(dev);
        if (err)
                goto out;

        dev_hold(dev);
        ipgre_tunnel_link(ign, nt);

out:
        return err;
}
1639
/* rtnl changelink handler: apply new netlink parameters to an
 * existing device.  The fallback device cannot be reconfigured.
 * When the endpoint addresses change, the tunnel is re-hashed under
 * the new keys. */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
                            struct nlattr *data[])
{
        struct ip_tunnel *t, *nt;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
        struct ip_tunnel_parm p;
        int mtu;

        if (dev == ign->fb_tunnel_dev)
                return -EINVAL;

        nt = netdev_priv(dev);
        ipgre_netlink_parms(data, &p);

        t = ipgre_tunnel_locate(net, &p, 0);

        if (t) {
                /* The requested parms already belong to another device. */
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = nt;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p.iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p.iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        /* Switching broadcast / point-to-point mode on a
                         * live device is not allowed. */
                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }

                /* Re-hash under the new endpoints / input key. */
                ipgre_tunnel_unlink(ign, t);
                t->parms.iph.saddr = p.iph.saddr;
                t->parms.iph.daddr = p.iph.daddr;
                t->parms.i_key = p.i_key;
                if (dev->type != ARPHRD_ETHER) {
                        memcpy(dev->dev_addr, &p.iph.saddr, 4);
                        memcpy(dev->broadcast, &p.iph.daddr, 4);
                }
                ipgre_tunnel_link(ign, t);
                netdev_state_change(dev);
        }

        /* Fields that do not affect hashing are updated in place. */
        t->parms.o_key = p.o_key;
        t->parms.iph.ttl = p.iph.ttl;
        t->parms.iph.tos = p.iph.tos;
        t->parms.iph.frag_off = p.iph.frag_off;

        if (t->parms.link != p.link) {
                t->parms.link = p.link;
                mtu = ipgre_tunnel_bind_dev(dev);
                if (!tb[IFLA_MTU])
                        dev->mtu = mtu;
                netdev_state_change(dev);
        }

        return 0;
}
1703
1704static size_t ipgre_get_size(const struct net_device *dev)
1705{
1706        return
1707                /* IFLA_GRE_LINK */
1708                nla_total_size(4) +
1709                /* IFLA_GRE_IFLAGS */
1710                nla_total_size(2) +
1711                /* IFLA_GRE_OFLAGS */
1712                nla_total_size(2) +
1713                /* IFLA_GRE_IKEY */
1714                nla_total_size(4) +
1715                /* IFLA_GRE_OKEY */
1716                nla_total_size(4) +
1717                /* IFLA_GRE_LOCAL */
1718                nla_total_size(4) +
1719                /* IFLA_GRE_REMOTE */
1720                nla_total_size(4) +
1721                /* IFLA_GRE_TTL */
1722                nla_total_size(1) +
1723                /* IFLA_GRE_TOS */
1724                nla_total_size(1) +
1725                /* IFLA_GRE_PMTUDISC */
1726                nla_total_size(1) +
1727                0;
1728}
1729
1730static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1731{
1732        struct ip_tunnel *t = netdev_priv(dev);
1733        struct ip_tunnel_parm *p = &t->parms;
1734
1735        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1736            nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1737            nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1738            nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1739            nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1740            nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1741            nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1742            nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1743            nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1744            nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1745                       !!(p->iph.frag_off & htons(IP_DF))))
1746                goto nla_put_failure;
1747        return 0;
1748
1749nla_put_failure:
1750        return -EMSGSIZE;
1751}
1752
/* Netlink attribute validation policy for IFLA_GRE_* attributes.
 * LOCAL/REMOTE are validated by length only (raw IPv4 addresses,
 * network byte order), the rest by integer type.
 */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1765
/* rtnetlink ops for plain (layer-3) "gre" devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1778
/* rtnetlink ops for Ethernet-over-GRE "gretap" devices; shares the
 * policy and netlink fill/size helpers with plain "gre", but uses the
 * tap-specific setup/validate callbacks.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1791
1792/*
1793 *      And now the modules code and kernel interface.
1794 */
1795
1796static int __init ipgre_init(void)
1797{
1798        int err;
1799
1800        pr_info("GRE over IPv4 tunneling driver\n");
1801
1802        err = register_pernet_device(&ipgre_net_ops);
1803        if (err < 0)
1804                return err;
1805
1806        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1807        if (err < 0) {
1808                pr_info("%s: can't add protocol\n", __func__);
1809                goto add_proto_failed;
1810        }
1811
1812        err = rtnl_link_register(&ipgre_link_ops);
1813        if (err < 0)
1814                goto rtnl_link_failed;
1815
1816        err = rtnl_link_register(&ipgre_tap_ops);
1817        if (err < 0)
1818                goto tap_ops_failed;
1819
1820out:
1821        return err;
1822
1823tap_ops_failed:
1824        rtnl_link_unregister(&ipgre_link_ops);
1825rtnl_link_failed:
1826        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1827add_proto_failed:
1828        unregister_pernet_device(&ipgre_net_ops);
1829        goto out;
1830}
1831
/* Module exit: tear down in strict reverse order of ipgre_init() —
 * rtnl link ops first (no new devices), then the GRE protocol handler
 * (no new packets), then the pernet ops (destroys remaining tunnels).
 */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	/* Failure here is only reportable; nothing useful to unwind. */
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}
1840
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Autoload this module when "gre"/"gretap" rtnl links or the gre0
 * fallback device are requested. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
1847