linux/net/ipv4/ipmr.c
   1/*
   2 *      IP multicast routing support for mrouted 3.6/3.8
   3 *
   4 *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5 *        Linux Consultancy and Custom Driver Development
   6 *
   7 *      This program is free software; you can redistribute it and/or
   8 *      modify it under the terms of the GNU General Public License
   9 *      as published by the Free Software Foundation; either version
  10 *      2 of the License, or (at your option) any later version.
  11 *
  12 *      Fixes:
  13 *      Michael Chastain        :       Incorrect size of copying.
  14 *      Alan Cox                :       Added the cache manager code
  15 *      Alan Cox                :       Fixed the clone/copy bug and device race.
  16 *      Mike McLagan            :       Routing by source
  17 *      Malcolm Beattie         :       Buffer handling fixes.
  18 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
  19 *      SVR Anand               :       Fixed several multicast bugs and problems.
  20 *      Alexey Kuznetsov        :       Status, optimisations and more.
  21 *      Brad Parker             :       Better behaviour on mrouted upcall
  22 *                                      overflow.
  23 *      Carlos Picoto           :       PIMv1 Support
  24 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
  25 *                                      Relax this requirement to work with older peers.
  26 *
  27 */
  28
  29#include <asm/system.h>
  30#include <asm/uaccess.h>
  31#include <linux/types.h>
  32#include <linux/capability.h>
  33#include <linux/errno.h>
  34#include <linux/timer.h>
  35#include <linux/mm.h>
  36#include <linux/kernel.h>
  37#include <linux/fcntl.h>
  38#include <linux/stat.h>
  39#include <linux/socket.h>
  40#include <linux/in.h>
  41#include <linux/inet.h>
  42#include <linux/netdevice.h>
  43#include <linux/inetdevice.h>
  44#include <linux/igmp.h>
  45#include <linux/proc_fs.h>
  46#include <linux/seq_file.h>
  47#include <linux/mroute.h>
  48#include <linux/init.h>
  49#include <linux/if_ether.h>
  50#include <linux/slab.h>
  51#include <net/net_namespace.h>
  52#include <net/ip.h>
  53#include <net/protocol.h>
  54#include <linux/skbuff.h>
  55#include <net/route.h>
  56#include <net/sock.h>
  57#include <net/icmp.h>
  58#include <net/udp.h>
  59#include <net/raw.h>
  60#include <linux/notifier.h>
  61#include <linux/if_arp.h>
  62#include <linux/netfilter_ipv4.h>
  63#include <net/ipip.h>
  64#include <net/checksum.h>
  65#include <net/netlink.h>
  66#include <net/fib_rules.h>
  67
  68#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
  69#define CONFIG_IP_PIMSM 1
  70#endif
  71
  72struct mr_table {
  73        struct list_head        list;
  74#ifdef CONFIG_NET_NS
  75        struct net              *net;
  76#endif
  77        u32                     id;
  78        struct sock             *mroute_sk;
  79        struct timer_list       ipmr_expire_timer;
  80        struct list_head        mfc_unres_queue;
  81        struct list_head        mfc_cache_array[MFC_LINES];
  82        struct vif_device       vif_table[MAXVIFS];
  83        int                     maxvif;
  84        atomic_t                cache_resolve_queue_len;
  85        int                     mroute_do_assert;
  86        int                     mroute_do_pim;
  87#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
  88        int                     mroute_reg_vif_num;
  89#endif
  90};
  91
  92struct ipmr_rule {
  93        struct fib_rule         common;
  94};
  95
  96struct ipmr_result {
  97        struct mr_table         *mrt;
  98};
  99
 100/* Big lock, protecting vif table, mrt cache and mroute socket state.
 101   Note that changes to them are serialized via rtnl_lock.
 102 */
 103
 104static DEFINE_RWLOCK(mrt_lock);
 105
 106/*
 107 *      Multicast router control variables
 108 */
 109
 110#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
 111
 112/* Special spinlock for queue of unresolved entries */
 113static DEFINE_SPINLOCK(mfc_unres_lock);
 114
 115/* We return to Alan's original scheme. The hash table of resolved
 116   entries is changed only in process context and is protected by
 117   the weak lock mrt_lock. The queue of unresolved entries is
 118   protected by the strong spinlock mfc_unres_lock.
 119
 120   This keeps the data path entirely free of exclusive locks.
 121 */
 122
 123static struct kmem_cache *mrt_cachep __read_mostly;
 124
 125static struct mr_table *ipmr_new_table(struct net *net, u32 id);
 126static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 127                         struct sk_buff *skb, struct mfc_cache *cache,
 128                         int local);
 129static int ipmr_cache_report(struct mr_table *mrt,
 130                             struct sk_buff *pkt, vifi_t vifi, int assert);
 131static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 132                              struct mfc_cache *c, struct rtmsg *rtm);
 133static void ipmr_expire_process(unsigned long arg);
 134
 135#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
 136#define ipmr_for_each_table(mrt, net) \
 137        list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
 138
 139static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 140{
 141        struct mr_table *mrt;
 142
 143        ipmr_for_each_table(mrt, net) {
 144                if (mrt->id == id)
 145                        return mrt;
 146        }
 147        return NULL;
 148}
 149
 150static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
 151                           struct mr_table **mrt)
 152{
 153        struct ipmr_result res;
 154        struct fib_lookup_arg arg = { .result = &res, };
 155        int err;
 156
 157        err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
 158        if (err < 0)
 159                return err;
 160        *mrt = res.mrt;
 161        return 0;
 162}
 163
 164static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
 165                            int flags, struct fib_lookup_arg *arg)
 166{
 167        struct ipmr_result *res = arg->result;
 168        struct mr_table *mrt;
 169
 170        switch (rule->action) {
 171        case FR_ACT_TO_TBL:
 172                break;
 173        case FR_ACT_UNREACHABLE:
 174                return -ENETUNREACH;
 175        case FR_ACT_PROHIBIT:
 176                return -EACCES;
 177        case FR_ACT_BLACKHOLE:
 178        default:
 179                return -EINVAL;
 180        }
 181
 182        mrt = ipmr_get_table(rule->fr_net, rule->table);
 183        if (mrt == NULL)
 184                return -EAGAIN;
 185        res->mrt = mrt;
 186        return 0;
 187}
 188
 189static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 190{
 191        return 1;
 192}
 193
 194static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
 195        FRA_GENERIC_POLICY,
 196};
 197
 198static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 199                               struct fib_rule_hdr *frh, struct nlattr **tb)
 200{
 201        return 0;
 202}
 203
 204static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
 205                             struct nlattr **tb)
 206{
 207        return 1;
 208}
 209
 210static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 211                          struct fib_rule_hdr *frh)
 212{
 213        frh->dst_len = 0;
 214        frh->src_len = 0;
 215        frh->tos     = 0;
 216        return 0;
 217}
 218
 219static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
 220        .family         = RTNL_FAMILY_IPMR,
 221        .rule_size      = sizeof(struct ipmr_rule),
 222        .addr_size      = sizeof(u32),
 223        .action         = ipmr_rule_action,
 224        .match          = ipmr_rule_match,
 225        .configure      = ipmr_rule_configure,
 226        .compare        = ipmr_rule_compare,
 227        .default_pref   = fib_default_rule_pref,
 228        .fill           = ipmr_rule_fill,
 229        .nlgroup        = RTNLGRP_IPV4_RULE,
 230        .policy         = ipmr_rule_policy,
 231        .owner          = THIS_MODULE,
 232};
 233
 234static int __net_init ipmr_rules_init(struct net *net)
 235{
 236        struct fib_rules_ops *ops;
 237        struct mr_table *mrt;
 238        int err;
 239
 240        ops = fib_rules_register(&ipmr_rules_ops_template, net);
 241        if (IS_ERR(ops))
 242                return PTR_ERR(ops);
 243
 244        INIT_LIST_HEAD(&net->ipv4.mr_tables);
 245
 246        mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
 247        if (mrt == NULL) {
 248                err = -ENOMEM;
 249                goto err1;
 250        }
 251
 252        err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
 253        if (err < 0)
 254                goto err2;
 255
 256        net->ipv4.mr_rules_ops = ops;
 257        return 0;
 258
 259err2:
 260        kfree(mrt);
 261err1:
 262        fib_rules_unregister(ops);
 263        return err;
 264}
 265
 266static void __net_exit ipmr_rules_exit(struct net *net)
 267{
 268        struct mr_table *mrt, *next;
 269
 270        list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
 271                list_del(&mrt->list);
 272                kfree(mrt);
 273        }
 274        fib_rules_unregister(net->ipv4.mr_rules_ops);
 275}
 276#else
 277#define ipmr_for_each_table(mrt, net) \
 278        for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
 279
 280static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 281{
 282        return net->ipv4.mrt;
 283}
 284
 285static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
 286                           struct mr_table **mrt)
 287{
 288        *mrt = net->ipv4.mrt;
 289        return 0;
 290}
 291
 292static int __net_init ipmr_rules_init(struct net *net)
 293{
 294        net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
 295        return net->ipv4.mrt ? 0 : -ENOMEM;
 296}
 297
 298static void __net_exit ipmr_rules_exit(struct net *net)
 299{
 300        kfree(net->ipv4.mrt);
 301}
 302#endif
 303
 304static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 305{
 306        struct mr_table *mrt;
 307        unsigned int i;
 308
 309        mrt = ipmr_get_table(net, id);
 310        if (mrt != NULL)
 311                return mrt;
 312
 313        mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
 314        if (mrt == NULL)
 315                return NULL;
 316        write_pnet(&mrt->net, net);
 317        mrt->id = id;
 318
 319        /* Forwarding cache */
 320        for (i = 0; i < MFC_LINES; i++)
 321                INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
 322
 323        INIT_LIST_HEAD(&mrt->mfc_unres_queue);
 324
 325        setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
 326                    (unsigned long)mrt);
 327
 328#ifdef CONFIG_IP_PIMSM
 329        mrt->mroute_reg_vif_num = -1;
 330#endif
 331#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
 332        list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
 333#endif
 334        return mrt;
 335}
 336
 337/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
 338
 339static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
 340{
 341        struct net *net = dev_net(dev);
 342
 343        dev_close(dev);
 344
 345        dev = __dev_get_by_name(net, "tunl0");
 346        if (dev) {
 347                const struct net_device_ops *ops = dev->netdev_ops;
 348                struct ifreq ifr;
 349                struct ip_tunnel_parm p;
 350
 351                memset(&p, 0, sizeof(p));
 352                p.iph.daddr = v->vifc_rmt_addr.s_addr;
 353                p.iph.saddr = v->vifc_lcl_addr.s_addr;
 354                p.iph.version = 4;
 355                p.iph.ihl = 5;
 356                p.iph.protocol = IPPROTO_IPIP;
 357                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
 358                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
 359
 360                if (ops->ndo_do_ioctl) {
 361                        mm_segment_t oldfs = get_fs();
 362
 363                        set_fs(KERNEL_DS);
 364                        ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
 365                        set_fs(oldfs);
 366                }
 367        }
 368}
 369
 370static
 371struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
 372{
 373        struct net_device  *dev;
 374
 375        dev = __dev_get_by_name(net, "tunl0");
 376
 377        if (dev) {
 378                const struct net_device_ops *ops = dev->netdev_ops;
 379                int err;
 380                struct ifreq ifr;
 381                struct ip_tunnel_parm p;
 382                struct in_device  *in_dev;
 383
 384                memset(&p, 0, sizeof(p));
 385                p.iph.daddr = v->vifc_rmt_addr.s_addr;
 386                p.iph.saddr = v->vifc_lcl_addr.s_addr;
 387                p.iph.version = 4;
 388                p.iph.ihl = 5;
 389                p.iph.protocol = IPPROTO_IPIP;
 390                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
 391                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
 392
 393                if (ops->ndo_do_ioctl) {
 394                        mm_segment_t oldfs = get_fs();
 395
 396                        set_fs(KERNEL_DS);
 397                        err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
 398                        set_fs(oldfs);
 399                } else
 400                        err = -EOPNOTSUPP;
 401
 402                dev = NULL;
 403
 404                if (err == 0 &&
 405                    (dev = __dev_get_by_name(net, p.name)) != NULL) {
 406                        dev->flags |= IFF_MULTICAST;
 407
 408                        in_dev = __in_dev_get_rtnl(dev);
 409                        if (in_dev == NULL)
 410                                goto failure;
 411
 412                        ipv4_devconf_setall(in_dev);
 413                        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
 414
 415                        if (dev_open(dev))
 416                                goto failure;
 417                        dev_hold(dev);
 418                }
 419        }
 420        return dev;
 421
 422failure:
 423        /* allow the register to be completed before unregistering. */
 424        rtnl_unlock();
 425        rtnl_lock();
 426
 427        unregister_netdevice(dev);
 428        return NULL;
 429}
 430
 431#ifdef CONFIG_IP_PIMSM
 432
 433static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
 434{
 435        struct net *net = dev_net(dev);
 436        struct mr_table *mrt;
 437        struct flowi fl = {
 438                .oif            = dev->ifindex,
 439                .iif            = skb->skb_iif,
 440                .mark           = skb->mark,
 441        };
 442        int err;
 443
 444        err = ipmr_fib_lookup(net, &fl, &mrt);
 445        if (err < 0) {
 446                kfree_skb(skb);
 447                return err;
 448        }
 449
 450        read_lock(&mrt_lock);
 451        dev->stats.tx_bytes += skb->len;
 452        dev->stats.tx_packets++;
 453        ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
 454        read_unlock(&mrt_lock);
 455        kfree_skb(skb);
 456        return NETDEV_TX_OK;
 457}
 458
 459static const struct net_device_ops reg_vif_netdev_ops = {
 460        .ndo_start_xmit = reg_vif_xmit,
 461};
 462
 463static void reg_vif_setup(struct net_device *dev)
 464{
 465        dev->type               = ARPHRD_PIMREG;
 466        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
 467        dev->flags              = IFF_NOARP;
 468        dev->netdev_ops         = &reg_vif_netdev_ops;
 469        dev->destructor         = free_netdev;
 470        dev->features           |= NETIF_F_NETNS_LOCAL;
 471}
 472
 473static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
 474{
 475        struct net_device *dev;
 476        struct in_device *in_dev;
 477        char name[IFNAMSIZ];
 478
 479        if (mrt->id == RT_TABLE_DEFAULT)
 480                sprintf(name, "pimreg");
 481        else
 482                sprintf(name, "pimreg%u", mrt->id);
 483
 484        dev = alloc_netdev(0, name, reg_vif_setup);
 485
 486        if (dev == NULL)
 487                return NULL;
 488
 489        dev_net_set(dev, net);
 490
 491        if (register_netdevice(dev)) {
 492                free_netdev(dev);
 493                return NULL;
 494        }
 495        dev->iflink = 0;
 496
 497        rcu_read_lock();
 498        if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
 499                rcu_read_unlock();
 500                goto failure;
 501        }
 502
 503        ipv4_devconf_setall(in_dev);
 504        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
 505        rcu_read_unlock();
 506
 507        if (dev_open(dev))
 508                goto failure;
 509
 510        dev_hold(dev);
 511
 512        return dev;
 513
 514failure:
 515        /* allow the register to be completed before unregistering. */
 516        rtnl_unlock();
 517        rtnl_lock();
 518
 519        unregister_netdevice(dev);
 520        return NULL;
 521}
 522#endif
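/*
 * A minimal userspace sketch (not part of ipmr.c) of how a PIM daemon would
 * typically bring the register vif above into service. "fd" is assumed to be
 * the daemon's raw IGMP socket that already issued MRT_INIT, "regvif_index"
 * is a placeholder, and error handling is omitted.
 *
 *      int one = 1;
 *      struct vifctl vc;
 *
 *      setsockopt(fd, IPPROTO_IP, MRT_PIM, &one, sizeof(one));
 *
 *      memset(&vc, 0, sizeof(vc));
 *      vc.vifc_vifi  = regvif_index;
 *      vc.vifc_flags = VIFF_REGISTER;          // kernel creates the "pimreg" device
 *      setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *
 * Packets that reg_vif_xmit() sees on the pimreg device are reported to the
 * daemon as IGMPMSG_WHOLEPKT upcalls, and the daemon performs the actual
 * PIM Register encapsulation towards the RP.
 */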
 523
 524/*
 525 *      Delete a VIF entry
 526 *      @notify: Set to 1, if the caller is a notifier_call
 527 */
 528
 529static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 530                      struct list_head *head)
 531{
 532        struct vif_device *v;
 533        struct net_device *dev;
 534        struct in_device *in_dev;
 535
 536        if (vifi < 0 || vifi >= mrt->maxvif)
 537                return -EADDRNOTAVAIL;
 538
 539        v = &mrt->vif_table[vifi];
 540
 541        write_lock_bh(&mrt_lock);
 542        dev = v->dev;
 543        v->dev = NULL;
 544
 545        if (!dev) {
 546                write_unlock_bh(&mrt_lock);
 547                return -EADDRNOTAVAIL;
 548        }
 549
 550#ifdef CONFIG_IP_PIMSM
 551        if (vifi == mrt->mroute_reg_vif_num)
 552                mrt->mroute_reg_vif_num = -1;
 553#endif
 554
 555        if (vifi+1 == mrt->maxvif) {
 556                int tmp;
 557                for (tmp=vifi-1; tmp>=0; tmp--) {
 558                        if (VIF_EXISTS(mrt, tmp))
 559                                break;
 560                }
 561                mrt->maxvif = tmp+1;
 562        }
 563
 564        write_unlock_bh(&mrt_lock);
 565
 566        dev_set_allmulti(dev, -1);
 567
 568        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
 569                IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
 570                ip_rt_multicast_event(in_dev);
 571        }
 572
 573        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
 574                unregister_netdevice_queue(dev, head);
 575
 576        dev_put(dev);
 577        return 0;
 578}
 579
 580static inline void ipmr_cache_free(struct mfc_cache *c)
 581{
 582        kmem_cache_free(mrt_cachep, c);
 583}
 584
 585/* Destroy an unresolved cache entry, killing queued skbs
 586   and reporting error to netlink readers.
 587 */
 588
 589static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
 590{
 591        struct net *net = read_pnet(&mrt->net);
 592        struct sk_buff *skb;
 593        struct nlmsgerr *e;
 594
 595        atomic_dec(&mrt->cache_resolve_queue_len);
 596
 597        while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
 598                if (ip_hdr(skb)->version == 0) {
 599                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
 600                        nlh->nlmsg_type = NLMSG_ERROR;
 601                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
 602                        skb_trim(skb, nlh->nlmsg_len);
 603                        e = NLMSG_DATA(nlh);
 604                        e->error = -ETIMEDOUT;
 605                        memset(&e->msg, 0, sizeof(e->msg));
 606
 607                        rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
 608                } else
 609                        kfree_skb(skb);
 610        }
 611
 612        ipmr_cache_free(c);
 613}
 614
 615
 616/* Timer process for the unresolved queue. */
 617
 618static void ipmr_expire_process(unsigned long arg)
 619{
 620        struct mr_table *mrt = (struct mr_table *)arg;
 621        unsigned long now;
 622        unsigned long expires;
 623        struct mfc_cache *c, *next;
 624
 625        if (!spin_trylock(&mfc_unres_lock)) {
 626                mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
 627                return;
 628        }
 629
 630        if (list_empty(&mrt->mfc_unres_queue))
 631                goto out;
 632
 633        now = jiffies;
 634        expires = 10*HZ;
 635
 636        list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
 637                if (time_after(c->mfc_un.unres.expires, now)) {
 638                        unsigned long interval = c->mfc_un.unres.expires - now;
 639                        if (interval < expires)
 640                                expires = interval;
 641                        continue;
 642                }
 643
 644                list_del(&c->list);
 645                ipmr_destroy_unres(mrt, c);
 646        }
 647
 648        if (!list_empty(&mrt->mfc_unres_queue))
 649                mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
 650
 651out:
 652        spin_unlock(&mfc_unres_lock);
 653}
 654
 655/* Fill oifs list. It is called under write locked mrt_lock. */
 656
 657static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
 658                                   unsigned char *ttls)
 659{
 660        int vifi;
 661
 662        cache->mfc_un.res.minvif = MAXVIFS;
 663        cache->mfc_un.res.maxvif = 0;
 664        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
 665
 666        for (vifi = 0; vifi < mrt->maxvif; vifi++) {
 667                if (VIF_EXISTS(mrt, vifi) &&
 668                    ttls[vifi] && ttls[vifi] < 255) {
 669                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
 670                        if (cache->mfc_un.res.minvif > vifi)
 671                                cache->mfc_un.res.minvif = vifi;
 672                        if (cache->mfc_un.res.maxvif <= vifi)
 673                                cache->mfc_un.res.maxvif = vifi + 1;
 674                }
 675        }
 676}
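/*
 * For reference: ipmr_update_thresholds() leaves ttls[vifi] at 255 when the
 * daemon passed 0 or 255, which disables forwarding on that vif entirely.
 * ip_mr_forward() below transmits on vif ct only when the packet's TTL is
 * strictly greater than cache->mfc_un.res.ttls[ct], so a threshold of 1
 * forwards any packet arriving with TTL >= 2.
 */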
 677
 678static int vif_add(struct net *net, struct mr_table *mrt,
 679                   struct vifctl *vifc, int mrtsock)
 680{
 681        int vifi = vifc->vifc_vifi;
 682        struct vif_device *v = &mrt->vif_table[vifi];
 683        struct net_device *dev;
 684        struct in_device *in_dev;
 685        int err;
 686
 687        /* Is vif busy ? */
 688        if (VIF_EXISTS(mrt, vifi))
 689                return -EADDRINUSE;
 690
 691        switch (vifc->vifc_flags) {
 692#ifdef CONFIG_IP_PIMSM
 693        case VIFF_REGISTER:
 694                /*
 695                 * Special Purpose VIF in PIM
 696                 * All the packets will be sent to the daemon
 697                 */
 698                if (mrt->mroute_reg_vif_num >= 0)
 699                        return -EADDRINUSE;
 700                dev = ipmr_reg_vif(net, mrt);
 701                if (!dev)
 702                        return -ENOBUFS;
 703                err = dev_set_allmulti(dev, 1);
 704                if (err) {
 705                        unregister_netdevice(dev);
 706                        dev_put(dev);
 707                        return err;
 708                }
 709                break;
 710#endif
 711        case VIFF_TUNNEL:
 712                dev = ipmr_new_tunnel(net, vifc);
 713                if (!dev)
 714                        return -ENOBUFS;
 715                err = dev_set_allmulti(dev, 1);
 716                if (err) {
 717                        ipmr_del_tunnel(dev, vifc);
 718                        dev_put(dev);
 719                        return err;
 720                }
 721                break;
 722
 723        case VIFF_USE_IFINDEX:
 724        case 0:
 725                if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
 726                        dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
 727                        if (dev && dev->ip_ptr == NULL) {
 728                                dev_put(dev);
 729                                return -EADDRNOTAVAIL;
 730                        }
 731                } else
 732                        dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
 733
 734                if (!dev)
 735                        return -EADDRNOTAVAIL;
 736                err = dev_set_allmulti(dev, 1);
 737                if (err) {
 738                        dev_put(dev);
 739                        return err;
 740                }
 741                break;
 742        default:
 743                return -EINVAL;
 744        }
 745
 746        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
 747                dev_put(dev);
 748                return -EADDRNOTAVAIL;
 749        }
 750        IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
 751        ip_rt_multicast_event(in_dev);
 752
 753        /*
 754         *      Fill in the VIF structures
 755         */
 756        v->rate_limit = vifc->vifc_rate_limit;
 757        v->local = vifc->vifc_lcl_addr.s_addr;
 758        v->remote = vifc->vifc_rmt_addr.s_addr;
 759        v->flags = vifc->vifc_flags;
 760        if (!mrtsock)
 761                v->flags |= VIFF_STATIC;
 762        v->threshold = vifc->vifc_threshold;
 763        v->bytes_in = 0;
 764        v->bytes_out = 0;
 765        v->pkt_in = 0;
 766        v->pkt_out = 0;
 767        v->link = dev->ifindex;
 768        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
 769                v->link = dev->iflink;
 770
 771        /* And finish update writing critical data */
 772        write_lock_bh(&mrt_lock);
 773        v->dev = dev;
 774#ifdef CONFIG_IP_PIMSM
 775        if (v->flags&VIFF_REGISTER)
 776                mrt->mroute_reg_vif_num = vifi;
 777#endif
 778        if (vifi+1 > mrt->maxvif)
 779                mrt->maxvif = vifi+1;
 780        write_unlock_bh(&mrt_lock);
 781        return 0;
 782}
 783
 784static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 785                                         __be32 origin,
 786                                         __be32 mcastgrp)
 787{
 788        int line = MFC_HASH(mcastgrp, origin);
 789        struct mfc_cache *c;
 790
 791        list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
 792                if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
 793                        return c;
 794        }
 795        return NULL;
 796}
 797
 798/*
 799 *      Allocate a multicast cache entry
 800 */
 801static struct mfc_cache *ipmr_cache_alloc(void)
 802{
 803        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
 804        if (c == NULL)
 805                return NULL;
 806        c->mfc_un.res.minvif = MAXVIFS;
 807        return c;
 808}
 809
 810static struct mfc_cache *ipmr_cache_alloc_unres(void)
 811{
 812        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
 813        if (c == NULL)
 814                return NULL;
 815        skb_queue_head_init(&c->mfc_un.unres.unresolved);
 816        c->mfc_un.unres.expires = jiffies + 10*HZ;
 817        return c;
 818}
 819
 820/*
 821 *      A cache entry has gone into a resolved state from queued
 822 */
 823
 824static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
 825                               struct mfc_cache *uc, struct mfc_cache *c)
 826{
 827        struct sk_buff *skb;
 828        struct nlmsgerr *e;
 829
 830        /*
 831         *      Play the pending entries through our router
 832         */
 833
 834        while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
 835                if (ip_hdr(skb)->version == 0) {
 836                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
 837
 838                        if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
 839                                nlh->nlmsg_len = (skb_tail_pointer(skb) -
 840                                                  (u8 *)nlh);
 841                        } else {
 842                                nlh->nlmsg_type = NLMSG_ERROR;
 843                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
 844                                skb_trim(skb, nlh->nlmsg_len);
 845                                e = NLMSG_DATA(nlh);
 846                                e->error = -EMSGSIZE;
 847                                memset(&e->msg, 0, sizeof(e->msg));
 848                        }
 849
 850                        rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
 851                } else
 852                        ip_mr_forward(net, mrt, skb, c, 0);
 853        }
 854}
 855
 856/*
 857 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 858 *      expects the following bizarre scheme.
 859 *
 860 *      Called under mrt_lock.
 861 */
 862
 863static int ipmr_cache_report(struct mr_table *mrt,
 864                             struct sk_buff *pkt, vifi_t vifi, int assert)
 865{
 866        struct sk_buff *skb;
 867        const int ihl = ip_hdrlen(pkt);
 868        struct igmphdr *igmp;
 869        struct igmpmsg *msg;
 870        int ret;
 871
 872#ifdef CONFIG_IP_PIMSM
 873        if (assert == IGMPMSG_WHOLEPKT)
 874                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
 875        else
 876#endif
 877                skb = alloc_skb(128, GFP_ATOMIC);
 878
 879        if (!skb)
 880                return -ENOBUFS;
 881
 882#ifdef CONFIG_IP_PIMSM
 883        if (assert == IGMPMSG_WHOLEPKT) {
 884                /* Ugly, but we have no choice with this interface.
 885                   Duplicate old header, fix ihl, length etc.
 886                   And all this only to mangle msg->im_msgtype and
 887                   to set msg->im_mbz to "mbz" :-)
 888                 */
 889                skb_push(skb, sizeof(struct iphdr));
 890                skb_reset_network_header(skb);
 891                skb_reset_transport_header(skb);
 892                msg = (struct igmpmsg *)skb_network_header(skb);
 893                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
 894                msg->im_msgtype = IGMPMSG_WHOLEPKT;
 895                msg->im_mbz = 0;
 896                msg->im_vif = mrt->mroute_reg_vif_num;
 897                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
 898                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
 899                                             sizeof(struct iphdr));
 900        } else
 901#endif
 902        {
 903
 904        /*
 905         *      Copy the IP header
 906         */
 907
 908        skb->network_header = skb->tail;
 909        skb_put(skb, ihl);
 910        skb_copy_to_linear_data(skb, pkt->data, ihl);
 911        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
 912        msg = (struct igmpmsg *)skb_network_header(skb);
 913        msg->im_vif = vifi;
 914        skb_dst_set(skb, dst_clone(skb_dst(pkt)));
 915
 916        /*
 917         *      Add our header
 918         */
 919
 920        igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
 921        igmp->type      =
 922        msg->im_msgtype = assert;
 923        igmp->code      =       0;
 924        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
 925        skb->transport_header = skb->network_header;
 926        }
 927
 928        if (mrt->mroute_sk == NULL) {
 929                kfree_skb(skb);
 930                return -EINVAL;
 931        }
 932
 933        /*
 934         *      Deliver to mrouted
 935         */
 936        ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
 937        if (ret < 0) {
 938                if (net_ratelimit())
 939                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
 940                kfree_skb(skb);
 941        }
 942
 943        return ret;
 944}
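/*
 * A minimal sketch (not part of this file) of the receiving side of the
 * upcall above, as mrouted/pimd would see it. The upcall arrives as an
 * ordinary datagram on the daemon's raw IGMP socket ("fd" below, the socket
 * that issued MRT_INIT); error handling is omitted.
 *
 *      #include <sys/socket.h>
 *      #include <linux/mroute.h>       // struct igmpmsg, IGMPMSG_*
 *
 *      char buf[2048];
 *      ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *      struct igmpmsg *im = (struct igmpmsg *)buf;   // overlays the IP header
 *
 *      switch (im->im_msgtype) {
 *      case IGMPMSG_NOCACHE:   // no MFC entry for (im_src, im_dst) yet;
 *              break;          // the daemon answers with MRT_ADD_MFC
 *      case IGMPMSG_WRONGVIF:  // packet arrived on an unexpected vif
 *              break;
 *      case IGMPMSG_WHOLEPKT:  // full packet for PIM Register encapsulation
 *              break;
 *      }
 *
 * The daemon tells kernel upcalls apart from genuine IGMP traffic by the
 * zeroed protocol byte (im_mbz) that the code above writes into the header.
 */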
 945
 946/*
 947 *      Queue a packet for resolution. It is attached to an unresolved cache entry.
 948 */
 949
 950static int
 951ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
 952{
 953        bool found = false;
 954        int err;
 955        struct mfc_cache *c;
 956        const struct iphdr *iph = ip_hdr(skb);
 957
 958        spin_lock_bh(&mfc_unres_lock);
 959        list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
 960                if (c->mfc_mcastgrp == iph->daddr &&
 961                    c->mfc_origin == iph->saddr) {
 962                        found = true;
 963                        break;
 964                }
 965        }
 966
 967        if (!found) {
 968                /*
 969                 *      Create a new entry if allowable
 970                 */
 971
 972                if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
 973                    (c = ipmr_cache_alloc_unres()) == NULL) {
 974                        spin_unlock_bh(&mfc_unres_lock);
 975
 976                        kfree_skb(skb);
 977                        return -ENOBUFS;
 978                }
 979
 980                /*
 981                 *      Fill in the new cache entry
 982                 */
 983                c->mfc_parent   = -1;
 984                c->mfc_origin   = iph->saddr;
 985                c->mfc_mcastgrp = iph->daddr;
 986
 987                /*
 988                 *      Reflect first query at mrouted.
 989                 */
 990                err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
 991                if (err < 0) {
 992                        /* If the report failed throw the cache entry
 993                           out - Brad Parker
 994                         */
 995                        spin_unlock_bh(&mfc_unres_lock);
 996
 997                        ipmr_cache_free(c);
 998                        kfree_skb(skb);
 999                        return err;
1000                }
1001
1002                atomic_inc(&mrt->cache_resolve_queue_len);
1003                list_add(&c->list, &mrt->mfc_unres_queue);
1004
1005                if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
1006                        mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
1007        }
1008
1009        /*
1010         *      See if we can append the packet
1011         */
1012        if (c->mfc_un.unres.unresolved.qlen>3) {
1013                kfree_skb(skb);
1014                err = -ENOBUFS;
1015        } else {
1016                skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
1017                err = 0;
1018        }
1019
1020        spin_unlock_bh(&mfc_unres_lock);
1021        return err;
1022}
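/*
 * To summarize the resolution handshake implemented above: at most ten
 * unresolved (S,G) entries are kept per table and at most four packets are
 * queued on each. The first packet triggers an IGMPMSG_NOCACHE upcall; if the
 * daemon does not install a matching route with MRT_ADD_MFC within roughly
 * ten seconds, ipmr_expire_process() destroys the entry and its queued
 * packets, reporting -ETIMEDOUT to any netlink readers.
 */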
1023
1024/*
1025 *      MFC cache manipulation by user space mroute daemon
1026 */
1027
1028static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1029{
1030        int line;
1031        struct mfc_cache *c, *next;
1032
1033        line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
1034
1035        list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
1036                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1037                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1038                        write_lock_bh(&mrt_lock);
1039                        list_del(&c->list);
1040                        write_unlock_bh(&mrt_lock);
1041
1042                        ipmr_cache_free(c);
1043                        return 0;
1044                }
1045        }
1046        return -ENOENT;
1047}
1048
1049static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1050                        struct mfcctl *mfc, int mrtsock)
1051{
1052        bool found = false;
1053        int line;
1054        struct mfc_cache *uc, *c;
1055
1056        if (mfc->mfcc_parent >= MAXVIFS)
1057                return -ENFILE;
1058
1059        line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
1060
1061        list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
1062                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1063                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1064                        found = true;
1065                        break;
1066                }
1067        }
1068
1069        if (found) {
1070                write_lock_bh(&mrt_lock);
1071                c->mfc_parent = mfc->mfcc_parent;
1072                ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
1073                if (!mrtsock)
1074                        c->mfc_flags |= MFC_STATIC;
1075                write_unlock_bh(&mrt_lock);
1076                return 0;
1077        }
1078
1079        if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
1080                return -EINVAL;
1081
1082        c = ipmr_cache_alloc();
1083        if (c == NULL)
1084                return -ENOMEM;
1085
1086        c->mfc_origin = mfc->mfcc_origin.s_addr;
1087        c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
1088        c->mfc_parent = mfc->mfcc_parent;
1089        ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
1090        if (!mrtsock)
1091                c->mfc_flags |= MFC_STATIC;
1092
1093        write_lock_bh(&mrt_lock);
1094        list_add(&c->list, &mrt->mfc_cache_array[line]);
1095        write_unlock_bh(&mrt_lock);
1096
1097        /*
1098         *      Check whether we have just resolved a queued unresolved
1099         *      entry. If so, replay the queued frames and tidy up.
1100         */
1101        found = false;
1102        spin_lock_bh(&mfc_unres_lock);
1103        list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
1104                if (uc->mfc_origin == c->mfc_origin &&
1105                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
1106                        list_del(&uc->list);
1107                        atomic_dec(&mrt->cache_resolve_queue_len);
1108                        found = true;
1109                        break;
1110                }
1111        }
1112        if (list_empty(&mrt->mfc_unres_queue))
1113                del_timer(&mrt->ipmr_expire_timer);
1114        spin_unlock_bh(&mfc_unres_lock);
1115
1116        if (found) {
1117                ipmr_cache_resolve(net, mrt, uc, c);
1118                ipmr_cache_free(uc);
1119        }
1120        return 0;
1121}
1122
1123/*
1124 *      Close the multicast socket, and clear the vif tables etc
1125 */
1126
1127static void mroute_clean_tables(struct mr_table *mrt)
1128{
1129        int i;
1130        LIST_HEAD(list);
1131        struct mfc_cache *c, *next;
1132
1133        /*
1134         *      Shut down all active vif entries
1135         */
1136        for (i = 0; i < mrt->maxvif; i++) {
1137                if (!(mrt->vif_table[i].flags&VIFF_STATIC))
1138                        vif_delete(mrt, i, 0, &list);
1139        }
1140        unregister_netdevice_many(&list);
1141
1142        /*
1143         *      Wipe the cache
1144         */
1145        for (i = 0; i < MFC_LINES; i++) {
1146                list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1147                        if (c->mfc_flags&MFC_STATIC)
1148                                continue;
1149                        write_lock_bh(&mrt_lock);
1150                        list_del(&c->list);
1151                        write_unlock_bh(&mrt_lock);
1152
1153                        ipmr_cache_free(c);
1154                }
1155        }
1156
1157        if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
1158                spin_lock_bh(&mfc_unres_lock);
1159                list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
1160                        list_del(&c->list);
1161                        ipmr_destroy_unres(mrt, c);
1162                }
1163                spin_unlock_bh(&mfc_unres_lock);
1164        }
1165}
1166
1167static void mrtsock_destruct(struct sock *sk)
1168{
1169        struct net *net = sock_net(sk);
1170        struct mr_table *mrt;
1171
1172        rtnl_lock();
1173        ipmr_for_each_table(mrt, net) {
1174                if (sk == mrt->mroute_sk) {
1175                        IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1176
1177                        write_lock_bh(&mrt_lock);
1178                        mrt->mroute_sk = NULL;
1179                        write_unlock_bh(&mrt_lock);
1180
1181                        mroute_clean_tables(mrt);
1182                }
1183        }
1184        rtnl_unlock();
1185}
1186
1187/*
1188 *      Socket options and virtual interface manipulation. The whole
1189 *      virtual interface system is a complete heap, but unfortunately
1190 *      that's how BSD mrouted happens to think. Maybe one day with a proper
1191 *      MOSPF/PIM router set up we can clean this up.
1192 */
1193
1194int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
1195{
1196        int ret;
1197        struct vifctl vif;
1198        struct mfcctl mfc;
1199        struct net *net = sock_net(sk);
1200        struct mr_table *mrt;
1201
1202        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1203        if (mrt == NULL)
1204                return -ENOENT;
1205
1206        if (optname != MRT_INIT) {
1207                if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
1208                        return -EACCES;
1209        }
1210
1211        switch (optname) {
1212        case MRT_INIT:
1213                if (sk->sk_type != SOCK_RAW ||
1214                    inet_sk(sk)->inet_num != IPPROTO_IGMP)
1215                        return -EOPNOTSUPP;
1216                if (optlen != sizeof(int))
1217                        return -ENOPROTOOPT;
1218
1219                rtnl_lock();
1220                if (mrt->mroute_sk) {
1221                        rtnl_unlock();
1222                        return -EADDRINUSE;
1223                }
1224
1225                ret = ip_ra_control(sk, 1, mrtsock_destruct);
1226                if (ret == 0) {
1227                        write_lock_bh(&mrt_lock);
1228                        mrt->mroute_sk = sk;
1229                        write_unlock_bh(&mrt_lock);
1230
1231                        IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1232                }
1233                rtnl_unlock();
1234                return ret;
1235        case MRT_DONE:
1236                if (sk != mrt->mroute_sk)
1237                        return -EACCES;
1238                return ip_ra_control(sk, 0, NULL);
1239        case MRT_ADD_VIF:
1240        case MRT_DEL_VIF:
1241                if (optlen != sizeof(vif))
1242                        return -EINVAL;
1243                if (copy_from_user(&vif, optval, sizeof(vif)))
1244                        return -EFAULT;
1245                if (vif.vifc_vifi >= MAXVIFS)
1246                        return -ENFILE;
1247                rtnl_lock();
1248                if (optname == MRT_ADD_VIF) {
1249                        ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
1250                } else {
1251                        ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1252                }
1253                rtnl_unlock();
1254                return ret;
1255
1256                /*
1257                 *      Manipulate the forwarding caches. These live
1258                 *      in a sort of kernel/user symbiosis.
1259                 */
1260        case MRT_ADD_MFC:
1261        case MRT_DEL_MFC:
1262                if (optlen != sizeof(mfc))
1263                        return -EINVAL;
1264                if (copy_from_user(&mfc, optval, sizeof(mfc)))
1265                        return -EFAULT;
1266                rtnl_lock();
1267                if (optname == MRT_DEL_MFC)
1268                        ret = ipmr_mfc_delete(mrt, &mfc);
1269                else
1270                        ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
1271                rtnl_unlock();
1272                return ret;
1273                /*
1274                 *      Control PIM assert.
1275                 */
1276        case MRT_ASSERT:
1277        {
1278                int v;
1279                if (get_user(v,(int __user *)optval))
1280                        return -EFAULT;
1281                mrt->mroute_do_assert = (v) ? 1 : 0;
1282                return 0;
1283        }
1284#ifdef CONFIG_IP_PIMSM
1285        case MRT_PIM:
1286        {
1287                int v;
1288
1289                if (get_user(v,(int __user *)optval))
1290                        return -EFAULT;
1291                v = (v) ? 1 : 0;
1292
1293                rtnl_lock();
1294                ret = 0;
1295                if (v != mrt->mroute_do_pim) {
1296                        mrt->mroute_do_pim = v;
1297                        mrt->mroute_do_assert = v;
1298                }
1299                rtnl_unlock();
1300                return ret;
1301        }
1302#endif
1303#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
1304        case MRT_TABLE:
1305        {
1306                u32 v;
1307
1308                if (optlen != sizeof(u32))
1309                        return -EINVAL;
1310                if (get_user(v, (u32 __user *)optval))
1311                        return -EFAULT;
1312                if (sk == mrt->mroute_sk)
1313                        return -EBUSY;
1314
1315                rtnl_lock();
1316                ret = 0;
1317                if (!ipmr_new_table(net, v))
1318                        ret = -ENOMEM;
1319                raw_sk(sk)->ipmr_table = v;
1320                rtnl_unlock();
1321                return ret;
1322        }
1323#endif
1324        /*
1325         *      Spurious command, or MRT_VERSION which you cannot
1326         *      set.
1327         */
1328        default:
1329                return -ENOPROTOOPT;
1330        }
1331}
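/*
 * A minimal sketch (not part of this file) of the setsockopt sequence a
 * routing daemon performs against the handler above. All MRT_* options are
 * issued at level IPPROTO_IP on a raw IGMP socket; addresses and vif numbers
 * below are placeholders and error handling is omitted.
 *
 *      #include <string.h>
 *      #include <sys/socket.h>
 *      #include <netinet/in.h>
 *      #include <arpa/inet.h>
 *      #include <linux/mroute.h>       // MRT_*, struct vifctl, struct mfcctl
 *
 *      int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int one = 1;
 *      struct vifctl vc;
 *      struct mfcctl mc;
 *
 *      // With CONFIG_IP_MROUTE_MULTIPLE_TABLES, MRT_TABLE must be set
 *      // before MRT_INIT binds this socket to a table (see -EBUSY above).
 *      setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *      memset(&vc, 0, sizeof(vc));
 *      vc.vifc_vifi = 0;
 *      vc.vifc_threshold = 1;
 *      vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");    // local if address
 *      setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *
 *      memset(&mc, 0, sizeof(mc));
 *      mc.mfcc_origin.s_addr   = inet_addr("192.0.2.10");   // S
 *      mc.mfcc_mcastgrp.s_addr = inet_addr("239.1.1.1");    // G
 *      mc.mfcc_parent  = 0;                                  // expected iif
 *      mc.mfcc_ttls[1] = 1;                                  // forward on vif 1
 *      setsockopt(fd, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 *
 *      setsockopt(fd, IPPROTO_IP, MRT_DONE, NULL, 0);        // teardown
 *
 * Only the socket that owns the table (mroute_sk) or a CAP_NET_ADMIN caller
 * may issue the other options, and a second MRT_INIT on the same table fails
 * with -EADDRINUSE, as enforced above.
 */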
1332
1333/*
1334 *      Getsock opt support for the multicast routing system.
1335 */
1336
1337int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1338{
1339        int olr;
1340        int val;
1341        struct net *net = sock_net(sk);
1342        struct mr_table *mrt;
1343
1344        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1345        if (mrt == NULL)
1346                return -ENOENT;
1347
1348        if (optname != MRT_VERSION &&
1349#ifdef CONFIG_IP_PIMSM
1350           optname!=MRT_PIM &&
1351#endif
1352           optname!=MRT_ASSERT)
1353                return -ENOPROTOOPT;
1354
1355        if (get_user(olr, optlen))
1356                return -EFAULT;
1357
1358        olr = min_t(unsigned int, olr, sizeof(int));
1359        if (olr < 0)
1360                return -EINVAL;
1361
1362        if (put_user(olr, optlen))
1363                return -EFAULT;
1364        if (optname == MRT_VERSION)
1365                val = 0x0305;
1366#ifdef CONFIG_IP_PIMSM
1367        else if (optname == MRT_PIM)
1368                val = mrt->mroute_do_pim;
1369#endif
1370        else
1371                val = mrt->mroute_do_assert;
1372        if (copy_to_user(optval, &val, olr))
1373                return -EFAULT;
1374        return 0;
1375}
1376
1377/*
1378 *      The IP multicast ioctl support routines.
1379 */
1380
1381int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1382{
1383        struct sioc_sg_req sr;
1384        struct sioc_vif_req vr;
1385        struct vif_device *vif;
1386        struct mfc_cache *c;
1387        struct net *net = sock_net(sk);
1388        struct mr_table *mrt;
1389
1390        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1391        if (mrt == NULL)
1392                return -ENOENT;
1393
1394        switch (cmd) {
1395        case SIOCGETVIFCNT:
1396                if (copy_from_user(&vr, arg, sizeof(vr)))
1397                        return -EFAULT;
1398                if (vr.vifi >= mrt->maxvif)
1399                        return -EINVAL;
1400                read_lock(&mrt_lock);
1401                vif = &mrt->vif_table[vr.vifi];
1402                if (VIF_EXISTS(mrt, vr.vifi)) {
1403                        vr.icount = vif->pkt_in;
1404                        vr.ocount = vif->pkt_out;
1405                        vr.ibytes = vif->bytes_in;
1406                        vr.obytes = vif->bytes_out;
1407                        read_unlock(&mrt_lock);
1408
1409                        if (copy_to_user(arg, &vr, sizeof(vr)))
1410                                return -EFAULT;
1411                        return 0;
1412                }
1413                read_unlock(&mrt_lock);
1414                return -EADDRNOTAVAIL;
1415        case SIOCGETSGCNT:
1416                if (copy_from_user(&sr, arg, sizeof(sr)))
1417                        return -EFAULT;
1418
1419                read_lock(&mrt_lock);
1420                c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1421                if (c) {
1422                        sr.pktcnt = c->mfc_un.res.pkt;
1423                        sr.bytecnt = c->mfc_un.res.bytes;
1424                        sr.wrong_if = c->mfc_un.res.wrong_if;
1425                        read_unlock(&mrt_lock);
1426
1427                        if (copy_to_user(arg, &sr, sizeof(sr)))
1428                                return -EFAULT;
1429                        return 0;
1430                }
1431                read_unlock(&mrt_lock);
1432                return -EADDRNOTAVAIL;
1433        default:
1434                return -ENOIOCTLCMD;
1435        }
1436}
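/*
 * A minimal sketch (not part of this file) of querying the counters exposed
 * by the ioctl handler above. Both requests are issued on the daemon's
 * mrouting socket ("fd"); addresses are placeholders and error handling is
 * omitted.
 *
 *      #include <string.h>
 *      #include <sys/ioctl.h>
 *      #include <arpa/inet.h>
 *      #include <linux/mroute.h>       // SIOCGETVIFCNT, SIOCGETSGCNT
 *
 *      struct sioc_vif_req vr = { .vifi = 0 };
 *      ioctl(fd, SIOCGETVIFCNT, &vr);
 *      // vr.icount/vr.ocount: packets in/out, vr.ibytes/vr.obytes: byte counts
 *
 *      struct sioc_sg_req sr;
 *      memset(&sr, 0, sizeof(sr));
 *      sr.src.s_addr = inet_addr("192.0.2.10");
 *      sr.grp.s_addr = inet_addr("239.1.1.1");
 *      ioctl(fd, SIOCGETSGCNT, &sr);
 *      // sr.pktcnt / sr.bytecnt / sr.wrong_if for that (S,G) entry
 */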
1437
1438
1439static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1440{
1441        struct net_device *dev = ptr;
1442        struct net *net = dev_net(dev);
1443        struct mr_table *mrt;
1444        struct vif_device *v;
1445        int ct;
1446        LIST_HEAD(list);
1447
1448        if (event != NETDEV_UNREGISTER)
1449                return NOTIFY_DONE;
1450
1451        ipmr_for_each_table(mrt, net) {
1452                v = &mrt->vif_table[0];
1453                for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1454                        if (v->dev == dev)
1455                                vif_delete(mrt, ct, 1, &list);
1456                }
1457        }
1458        unregister_netdevice_many(&list);
1459        return NOTIFY_DONE;
1460}
1461
1462
1463static struct notifier_block ip_mr_notifier = {
1464        .notifier_call = ipmr_device_event,
1465};
1466
1467/*
1468 *      Encapsulate a packet by attaching a valid IPIP header to it.
1469 *      This avoids tunnel drivers and other mess and gives us the speed so
1470 *      important for multicast video.
1471 */
1472
1473static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1474{
1475        struct iphdr *iph;
1476        struct iphdr *old_iph = ip_hdr(skb);
1477
1478        skb_push(skb, sizeof(struct iphdr));
1479        skb->transport_header = skb->network_header;
1480        skb_reset_network_header(skb);
1481        iph = ip_hdr(skb);
1482
1483        iph->version    =       4;
1484        iph->tos        =       old_iph->tos;
1485        iph->ttl        =       old_iph->ttl;
1486        iph->frag_off   =       0;
1487        iph->daddr      =       daddr;
1488        iph->saddr      =       saddr;
1489        iph->protocol   =       IPPROTO_IPIP;
1490        iph->ihl        =       5;
1491        iph->tot_len    =       htons(skb->len);
1492        ip_select_ident(iph, skb_dst(skb), NULL);
1493        ip_send_check(iph);
1494
1495        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1496        nf_reset(skb);
1497}
1498
1499static inline int ipmr_forward_finish(struct sk_buff *skb)
1500{
1501        struct ip_options * opt = &(IPCB(skb)->opt);
1502
1503        IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1504
1505        if (unlikely(opt->optlen))
1506                ip_forward_options(skb);
1507
1508        return dst_output(skb);
1509}
1510
1511/*
1512 *      Processing handlers for ipmr_forward
1513 */
1514
1515static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1516                            struct sk_buff *skb, struct mfc_cache *c, int vifi)
1517{
1518        const struct iphdr *iph = ip_hdr(skb);
1519        struct vif_device *vif = &mrt->vif_table[vifi];
1520        struct net_device *dev;
1521        struct rtable *rt;
1522        int    encap = 0;
1523
1524        if (vif->dev == NULL)
1525                goto out_free;
1526
1527#ifdef CONFIG_IP_PIMSM
1528        if (vif->flags & VIFF_REGISTER) {
1529                vif->pkt_out++;
1530                vif->bytes_out += skb->len;
1531                vif->dev->stats.tx_bytes += skb->len;
1532                vif->dev->stats.tx_packets++;
1533                ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1534                goto out_free;
1535        }
1536#endif
1537
1538        if (vif->flags&VIFF_TUNNEL) {
1539                struct flowi fl = { .oif = vif->link,
1540                                    .nl_u = { .ip4_u =
1541                                              { .daddr = vif->remote,
1542                                                .saddr = vif->local,
1543                                                .tos = RT_TOS(iph->tos) } },
1544                                    .proto = IPPROTO_IPIP };
1545                if (ip_route_output_key(net, &rt, &fl))
1546                        goto out_free;
1547                encap = sizeof(struct iphdr);
1548        } else {
1549                struct flowi fl = { .oif = vif->link,
1550                                    .nl_u = { .ip4_u =
1551                                              { .daddr = iph->daddr,
1552                                                .tos = RT_TOS(iph->tos) } },
1553                                    .proto = IPPROTO_IPIP };
1554                if (ip_route_output_key(net, &rt, &fl))
1555                        goto out_free;
1556        }
1557
1558        dev = rt->u.dst.dev;
1559
1560        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1561                /* Do not fragment multicasts. Alas, IPv4 does not
1562                   allow us to send ICMP here, so oversized packets
1563                   simply disappear into a black hole.
1564                 */
1565
1566                IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1567                ip_rt_put(rt);
1568                goto out_free;
1569        }
1570
1571        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1572
1573        if (skb_cow(skb, encap)) {
1574                ip_rt_put(rt);
1575                goto out_free;
1576        }
1577
1578        vif->pkt_out++;
1579        vif->bytes_out += skb->len;
1580
1581        skb_dst_drop(skb);
1582        skb_dst_set(skb, &rt->u.dst);
1583        ip_decrease_ttl(ip_hdr(skb));
1584
1585        /* FIXME: forward and output firewalls used to be called here.
1586         * What do we do with netfilter? -- RR */
1587        if (vif->flags & VIFF_TUNNEL) {
1588                ip_encap(skb, vif->local, vif->remote);
1589                /* FIXME: extra output firewall step used to be here. --RR */
1590                vif->dev->stats.tx_packets++;
1591                vif->dev->stats.tx_bytes += skb->len;
1592        }
1593
1594        IPCB(skb)->flags |= IPSKB_FORWARDED;
1595
1596        /*
1597         * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
1598         * locally not only before forwarding, but also after forwarding on
1599         * all output interfaces. In other words, if the mrouter runs a
1600         * multicast application, that application should receive packets
1601         * regardless of which interface it joined on.
1602         * If we did not do this, the application would have to join on all
1603         * interfaces. On the other hand, a multihomed host (or router, but
1604         * not an mrouter) cannot join on more than one interface, as that
1605         * would result in receiving duplicate packets.
1606         */
1607        NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
1608                ipmr_forward_finish);
1609        return;
1610
1611out_free:
1612        kfree_skb(skb);
1613}
1614
1615static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1616{
1617        int ct;
1618
1619        for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1620                if (mrt->vif_table[ct].dev == dev)
1621                        break;
1622        }
1623        return ct;
1624}
1625
1626/* "local" means that we should preserve one skb (for local delivery) */
1627
1628static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1629                         struct sk_buff *skb, struct mfc_cache *cache,
1630                         int local)
1631{
1632        int psend = -1;
1633        int vif, ct;
1634
1635        vif = cache->mfc_parent;
1636        cache->mfc_un.res.pkt++;
1637        cache->mfc_un.res.bytes += skb->len;
1638
1639        /*
1640         * Wrong interface: drop packet and (maybe) send PIM assert.
1641         */
1642        if (mrt->vif_table[vif].dev != skb->dev) {
1643                int true_vifi;
1644
1645                if (skb_rtable(skb)->fl.iif == 0) {
1646                        /* It is our own packet, looped back.
1647                           Very complicated situation...
1648
1649                           The best workaround until the routing daemons are
1650                           fixed is not to redistribute a packet that was
1651                           sent out through the wrong interface. This means that
1652                           multicast applications WILL NOT work for
1653                           (S,G) entries whose default multicast route points
1654                           to the wrong oif. In any case, it is not a good
1655                           idea to run multicast applications on a router.
1656                         */
1657                        goto dont_forward;
1658                }
1659
1660                cache->mfc_un.res.wrong_if++;
1661                true_vifi = ipmr_find_vif(mrt, skb->dev);
1662
1663                if (true_vifi >= 0 && mrt->mroute_do_assert &&
1664                    /* PIM-SM uses asserts when switching from the RPT to the SPT,
1665                       so we cannot require that the packet arrived on an oif.
1666                       That is unfortunate, but otherwise we would have to move a
1667                       fairly large chunk of pimd into the kernel. Ough... --ANK
1668                     */
1669                    (mrt->mroute_do_pim ||
1670                     cache->mfc_un.res.ttls[true_vifi] < 255) &&
1671                    time_after(jiffies,
1672                               cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1673                        cache->mfc_un.res.last_assert = jiffies;
1674                        ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1675                }
1676                goto dont_forward;
1677        }
1678
1679        mrt->vif_table[vif].pkt_in++;
1680        mrt->vif_table[vif].bytes_in += skb->len;
1681
1682        /*
1683         *      Forward the frame
1684         */
1685        for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1686                if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1687                        if (psend != -1) {
1688                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1689                                if (skb2)
1690                                        ipmr_queue_xmit(net, mrt, skb2, cache,
1691                                                        psend);
1692                        }
1693                        psend = ct;
1694                }
1695        }
1696        if (psend != -1) {
1697                if (local) {
1698                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1699                        if (skb2)
1700                                ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1701                } else {
1702                        ipmr_queue_xmit(net, mrt, skb, cache, psend);
1703                        return 0;
1704                }
1705        }
1706
1707dont_forward:
1708        if (!local)
1709                kfree_skb(skb);
1710        return 0;
1711}
1712
1713
1714/*
1715 *      Multicast packets for forwarding arrive here
1716 */
1717
1718int ip_mr_input(struct sk_buff *skb)
1719{
1720        struct mfc_cache *cache;
1721        struct net *net = dev_net(skb->dev);
1722        int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1723        struct mr_table *mrt;
1724        int err;
1725
1726        /* Packet is looped back after forwarding; it must not be
1727           forwarded a second time, but it can still be delivered locally.
1728         */
1729        if (IPCB(skb)->flags&IPSKB_FORWARDED)
1730                goto dont_forward;
1731
1732        err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1733        if (err < 0) {
1734                kfree_skb(skb);
1735                return err;
1736        }
1737
1738        if (!local) {
1739                if (IPCB(skb)->opt.router_alert) {
1740                        if (ip_call_ra_chain(skb))
1741                                return 0;
1742                } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1743                        /* IGMPv1 (and broken IGMPv2 implementations such as
1744                           Cisco IOS <= 11.2(8)) do not put the router alert
1745                           option into IGMP packets destined for routable
1746                           groups. This is very bad, because it means
1747                           that we can forward NO IGMP messages.
1748                         */
1749                        read_lock(&mrt_lock);
1750                        if (mrt->mroute_sk) {
1751                                nf_reset(skb);
1752                                raw_rcv(mrt->mroute_sk, skb);
1753                                read_unlock(&mrt_lock);
1754                                return 0;
1755                        }
1756                        read_unlock(&mrt_lock);
1757                }
1758        }
1759
1760        read_lock(&mrt_lock);
1761        cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1762
1763        /*
1764         *      No usable cache entry
1765         */
1766        if (cache == NULL) {
1767                int vif;
1768
1769                if (local) {
1770                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1771                        ip_local_deliver(skb);
1772                        if (skb2 == NULL) {
1773                                read_unlock(&mrt_lock);
1774                                return -ENOBUFS;
1775                        }
1776                        skb = skb2;
1777                }
1778
1779                vif = ipmr_find_vif(mrt, skb->dev);
1780                if (vif >= 0) {
1781                        int err2 = ipmr_cache_unresolved(mrt, vif, skb);
1782                        read_unlock(&mrt_lock);
1783
1784                        return err2;
1785                }
1786                read_unlock(&mrt_lock);
1787                kfree_skb(skb);
1788                return -ENODEV;
1789        }
1790
1791        ip_mr_forward(net, mrt, skb, cache, local);
1792
1793        read_unlock(&mrt_lock);
1794
1795        if (local)
1796                return ip_local_deliver(skb);
1797
1798        return 0;
1799
1800dont_forward:
1801        if (local)
1802                return ip_local_deliver(skb);
1803        kfree_skb(skb);
1804        return 0;
1805}
1806
1807#ifdef CONFIG_IP_PIMSM
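/*
 * Common PIM register decapsulation: sanity-check the encapsulated
 * packet and re-inject it so that it appears to have been received on
 * the register vif.  Returns 0 on success; non-zero tells the caller
 * to free the skb.
 */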
1808static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1809                     unsigned int pimlen)
1810{
1811        struct net_device *reg_dev = NULL;
1812        struct iphdr *encap;
1813
1814        encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1815        /*
1816           Check that:
1817           a. packet is really destined to a multicast group
1818           b. packet is not a NULL-REGISTER
1819           c. packet is not truncated
1820         */
1821        if (!ipv4_is_multicast(encap->daddr) ||
1822            encap->tot_len == 0 ||
1823            ntohs(encap->tot_len) + pimlen > skb->len)
1824                return 1;
1825
1826        read_lock(&mrt_lock);
1827        if (mrt->mroute_reg_vif_num >= 0)
1828                reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1829        if (reg_dev)
1830                dev_hold(reg_dev);
1831        read_unlock(&mrt_lock);
1832
1833        if (reg_dev == NULL)
1834                return 1;
1835
1836        skb->mac_header = skb->network_header;
1837        skb_pull(skb, (u8*)encap - skb->data);
1838        skb_reset_network_header(skb);
1839        skb->protocol = htons(ETH_P_IP);
1840        skb->ip_summed = CHECKSUM_NONE;
1841        skb->pkt_type = PACKET_HOST;
1842
1843        skb_tunnel_rx(skb, reg_dev);
1844
1845        netif_rx(skb);
1846        dev_put(reg_dev);
1847
1848        return 0;
1849}
1850#endif
1851
1852#ifdef CONFIG_IP_PIMSM_V1
1853/*
1854 * Handle IGMP messages of PIMv1
1855 */
1856
1857int pim_rcv_v1(struct sk_buff *skb)
1858{
1859        struct igmphdr *pim;
1860        struct net *net = dev_net(skb->dev);
1861        struct mr_table *mrt;
1862
1863        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1864                goto drop;
1865
1866        pim = igmp_hdr(skb);
1867
1868        if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1869                goto drop;
1870
1871        if (!mrt->mroute_do_pim ||
1872            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1873                goto drop;
1874
1875        if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1876drop:
1877                kfree_skb(skb);
1878        }
1879        return 0;
1880}
1881#endif
1882
1883#ifdef CONFIG_IP_PIMSM_V2
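/*
 * Handle PIMv2 register messages
 */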
1884static int pim_rcv(struct sk_buff *skb)
1885{
1886        struct pimreghdr *pim;
1887        struct net *net = dev_net(skb->dev);
1888        struct mr_table *mrt;
1889
1890        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1891                goto drop;
1892
1893        pim = (struct pimreghdr *)skb_transport_header(skb);
1894        if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1895            (pim->flags&PIM_NULL_REGISTER) ||
1896            (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1897             csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1898                goto drop;
1899
1900        if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1901                goto drop;
1902
1903        if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1904drop:
1905                kfree_skb(skb);
1906        }
1907        return 0;
1908}
1909#endif
1910
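/*
 * Fill the RTA_IIF and RTA_MULTIPATH attributes describing an MFC
 * entry.  Called with mrt_lock held for reading.
 */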
1911static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1912                              struct mfc_cache *c, struct rtmsg *rtm)
1913{
1914        int ct;
1915        struct rtnexthop *nhp;
1916        u8 *b = skb_tail_pointer(skb);
1917        struct rtattr *mp_head;
1918
1919        /* If cache is unresolved, don't try to parse IIF and OIF */
1920        if (c->mfc_parent >= MAXVIFS)
1921                return -ENOENT;
1922
1923        if (VIF_EXISTS(mrt, c->mfc_parent))
1924                RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
1925
1926        mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1927
1928        for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1929                if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
1930                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1931                                goto rtattr_failure;
1932                        nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1933                        nhp->rtnh_flags = 0;
1934                        nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1935                        nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
1936                        nhp->rtnh_len = sizeof(*nhp);
1937                }
1938        }
1939        mp_head->rta_type = RTA_MULTIPATH;
1940        mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1941        rtm->rtm_type = RTN_MULTICAST;
1942        return 1;
1943
1944rtattr_failure:
1945        nlmsg_trim(skb, b);
1946        return -EMSGSIZE;
1947}
1948
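/*
 * Report multicast forwarding state for an RTM_GETROUTE request.  If no
 * cache entry exists yet, the packet is queued as unresolved unless the
 * caller asked not to wait.
 */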
1949int ipmr_get_route(struct net *net,
1950                   struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1951{
1952        int err;
1953        struct mr_table *mrt;
1954        struct mfc_cache *cache;
1955        struct rtable *rt = skb_rtable(skb);
1956
1957        mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1958        if (mrt == NULL)
1959                return -ENOENT;
1960
1961        read_lock(&mrt_lock);
1962        cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1963
1964        if (cache == NULL) {
1965                struct sk_buff *skb2;
1966                struct iphdr *iph;
1967                struct net_device *dev;
1968                int vif;
1969
1970                if (nowait) {
1971                        read_unlock(&mrt_lock);
1972                        return -EAGAIN;
1973                }
1974
1975                dev = skb->dev;
1976                if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
1977                        read_unlock(&mrt_lock);
1978                        return -ENODEV;
1979                }
1980                skb2 = skb_clone(skb, GFP_ATOMIC);
1981                if (!skb2) {
1982                        read_unlock(&mrt_lock);
1983                        return -ENOMEM;
1984                }
1985
1986                skb_push(skb2, sizeof(struct iphdr));
1987                skb_reset_network_header(skb2);
1988                iph = ip_hdr(skb2);
1989                iph->ihl = sizeof(struct iphdr) >> 2;
1990                iph->saddr = rt->rt_src;
1991                iph->daddr = rt->rt_dst;
1992                iph->version = 0;
1993                err = ipmr_cache_unresolved(mrt, vif, skb2);
1994                read_unlock(&mrt_lock);
1995                return err;
1996        }
1997
1998        if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1999                cache->mfc_flags |= MFC_NOTIFY;
2000        err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
2001        read_unlock(&mrt_lock);
2002        return err;
2003}
2004
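/*
 * Build a single RTM_NEWROUTE message describing an MFC cache entry
 * for a netlink dump.
 */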
2005static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2006                            u32 pid, u32 seq, struct mfc_cache *c)
2007{
2008        struct nlmsghdr *nlh;
2009        struct rtmsg *rtm;
2010
2011        nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2012        if (nlh == NULL)
2013                return -EMSGSIZE;
2014
2015        rtm = nlmsg_data(nlh);
2016        rtm->rtm_family   = RTNL_FAMILY_IPMR;
2017        rtm->rtm_dst_len  = 32;
2018        rtm->rtm_src_len  = 32;
2019        rtm->rtm_tos      = 0;
2020        rtm->rtm_table    = mrt->id;
2021        NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2022        rtm->rtm_type     = RTN_MULTICAST;
2023        rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
2024        rtm->rtm_protocol = RTPROT_UNSPEC;
2025        rtm->rtm_flags    = 0;
2026
2027        NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2028        NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2029
2030        if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2031                goto nla_put_failure;
2032
2033        return nlmsg_end(skb, nlh);
2034
2035nla_put_failure:
2036        nlmsg_cancel(skb, nlh);
2037        return -EMSGSIZE;
2038}
2039
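/*
 * Dump all MFC cache entries via netlink.  cb->args[] records the
 * table, hash chain and entry indices so an interrupted dump can be
 * resumed.
 */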
2040static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2041{
2042        struct net *net = sock_net(skb->sk);
2043        struct mr_table *mrt;
2044        struct mfc_cache *mfc;
2045        unsigned int t = 0, s_t;
2046        unsigned int h = 0, s_h;
2047        unsigned int e = 0, s_e;
2048
2049        s_t = cb->args[0];
2050        s_h = cb->args[1];
2051        s_e = cb->args[2];
2052
2053        read_lock(&mrt_lock);
2054        ipmr_for_each_table(mrt, net) {
2055                if (t < s_t)
2056                        goto next_table;
2057                if (t > s_t)
2058                        s_h = 0;
2059                for (h = s_h; h < MFC_LINES; h++) {
2060                        list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
2061                                if (e < s_e)
2062                                        goto next_entry;
2063                                if (ipmr_fill_mroute(mrt, skb,
2064                                                     NETLINK_CB(cb->skb).pid,
2065                                                     cb->nlh->nlmsg_seq,
2066                                                     mfc) < 0)
2067                                        goto done;
2068next_entry:
2069                                e++;
2070                        }
2071                        e = s_e = 0;
2072                }
2073                s_h = 0;
2074next_table:
2075                t++;
2076        }
2077done:
2078        read_unlock(&mrt_lock);
2079
2080        cb->args[2] = e;
2081        cb->args[1] = h;
2082        cb->args[0] = t;
2083
2084        return skb->len;
2085}
2086
2087#ifdef CONFIG_PROC_FS
2088/*
2089 *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
2090 */
2091struct ipmr_vif_iter {
2092        struct seq_net_private p;
2093        struct mr_table *mrt;
2094        int ct;
2095};
2096
2097static struct vif_device *ipmr_vif_seq_idx(struct net *net,
2098                                           struct ipmr_vif_iter *iter,
2099                                           loff_t pos)
2100{
2101        struct mr_table *mrt = iter->mrt;
2102
2103        for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
2104                if (!VIF_EXISTS(mrt, iter->ct))
2105                        continue;
2106                if (pos-- == 0)
2107                        return &mrt->vif_table[iter->ct];
2108        }
2109        return NULL;
2110}
2111
2112static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
2113        __acquires(mrt_lock)
2114{
2115        struct ipmr_vif_iter *iter = seq->private;
2116        struct net *net = seq_file_net(seq);
2117        struct mr_table *mrt;
2118
2119        mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2120        if (mrt == NULL)
2121                return ERR_PTR(-ENOENT);
2122
2123        iter->mrt = mrt;
2124
2125        read_lock(&mrt_lock);
2126        return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
2127                : SEQ_START_TOKEN;
2128}
2129
2130static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2131{
2132        struct ipmr_vif_iter *iter = seq->private;
2133        struct net *net = seq_file_net(seq);
2134        struct mr_table *mrt = iter->mrt;
2135
2136        ++*pos;
2137        if (v == SEQ_START_TOKEN)
2138                return ipmr_vif_seq_idx(net, iter, 0);
2139
2140        while (++iter->ct < mrt->maxvif) {
2141                if (!VIF_EXISTS(mrt, iter->ct))
2142                        continue;
2143                return &mrt->vif_table[iter->ct];
2144        }
2145        return NULL;
2146}
2147
2148static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
2149        __releases(mrt_lock)
2150{
2151        read_unlock(&mrt_lock);
2152}
2153
2154static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2155{
2156        struct ipmr_vif_iter *iter = seq->private;
2157        struct mr_table *mrt = iter->mrt;
2158
2159        if (v == SEQ_START_TOKEN) {
2160                seq_puts(seq,
2161                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
2162        } else {
2163                const struct vif_device *vif = v;
2164                const char *name =  vif->dev ? vif->dev->name : "none";
2165
2166                seq_printf(seq,
2167                           "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2168                           vif - mrt->vif_table,
2169                           name, vif->bytes_in, vif->pkt_in,
2170                           vif->bytes_out, vif->pkt_out,
2171                           vif->flags, vif->local, vif->remote);
2172        }
2173        return 0;
2174}
2175
2176static const struct seq_operations ipmr_vif_seq_ops = {
2177        .start = ipmr_vif_seq_start,
2178        .next  = ipmr_vif_seq_next,
2179        .stop  = ipmr_vif_seq_stop,
2180        .show  = ipmr_vif_seq_show,
2181};
2182
2183static int ipmr_vif_open(struct inode *inode, struct file *file)
2184{
2185        return seq_open_net(inode, file, &ipmr_vif_seq_ops,
2186                            sizeof(struct ipmr_vif_iter));
2187}
2188
2189static const struct file_operations ipmr_vif_fops = {
2190        .owner   = THIS_MODULE,
2191        .open    = ipmr_vif_open,
2192        .read    = seq_read,
2193        .llseek  = seq_lseek,
2194        .release = seq_release_net,
2195};
2196
2197struct ipmr_mfc_iter {
2198        struct seq_net_private p;
2199        struct mr_table *mrt;
2200        struct list_head *cache;
2201        int ct;
2202};
2203
2204
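/*
 * Return the pos'th MFC entry, walking the resolved hash chains first
 * and then the unresolved queue.  On success the lock protecting the
 * list that holds the entry (mrt_lock or mfc_unres_lock) is left held;
 * ipmr_mfc_seq_stop() releases it.
 */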
2205static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2206                                          struct ipmr_mfc_iter *it, loff_t pos)
2207{
2208        struct mr_table *mrt = it->mrt;
2209        struct mfc_cache *mfc;
2210
2211        read_lock(&mrt_lock);
2212        for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2213                it->cache = &mrt->mfc_cache_array[it->ct];
2214                list_for_each_entry(mfc, it->cache, list)
2215                        if (pos-- == 0)
2216                                return mfc;
2217        }
2218        read_unlock(&mrt_lock);
2219
2220        spin_lock_bh(&mfc_unres_lock);
2221        it->cache = &mrt->mfc_unres_queue;
2222        list_for_each_entry(mfc, it->cache, list)
2223                if (pos-- == 0)
2224                        return mfc;
2225        spin_unlock_bh(&mfc_unres_lock);
2226
2227        it->cache = NULL;
2228        return NULL;
2229}
2230
2231
2232static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2233{
2234        struct ipmr_mfc_iter *it = seq->private;
2235        struct net *net = seq_file_net(seq);
2236        struct mr_table *mrt;
2237
2238        mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2239        if (mrt == NULL)
2240                return ERR_PTR(-ENOENT);
2241
2242        it->mrt = mrt;
2243        it->cache = NULL;
2244        it->ct = 0;
2245        return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
2246                : SEQ_START_TOKEN;
2247}
2248
2249static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2250{
2251        struct mfc_cache *mfc = v;
2252        struct ipmr_mfc_iter *it = seq->private;
2253        struct net *net = seq_file_net(seq);
2254        struct mr_table *mrt = it->mrt;
2255
2256        ++*pos;
2257
2258        if (v == SEQ_START_TOKEN)
2259                return ipmr_mfc_seq_idx(net, seq->private, 0);
2260
2261        if (mfc->list.next != it->cache)
2262                return list_entry(mfc->list.next, struct mfc_cache, list);
2263
2264        if (it->cache == &mrt->mfc_unres_queue)
2265                goto end_of_list;
2266
2267        BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
2268
2269        while (++it->ct < MFC_LINES) {
2270                it->cache = &mrt->mfc_cache_array[it->ct];
2271                if (list_empty(it->cache))
2272                        continue;
2273                return list_first_entry(it->cache, struct mfc_cache, list);
2274        }
2275
2276        /* exhausted cache_array, show unresolved */
2277        read_unlock(&mrt_lock);
2278        it->cache = &mrt->mfc_unres_queue;
2279        it->ct = 0;
2280
2281        spin_lock_bh(&mfc_unres_lock);
2282        if (!list_empty(it->cache))
2283                return list_first_entry(it->cache, struct mfc_cache, list);
2284
2285 end_of_list:
2286        spin_unlock_bh(&mfc_unres_lock);
2287        it->cache = NULL;
2288
2289        return NULL;
2290}
2291
2292static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2293{
2294        struct ipmr_mfc_iter *it = seq->private;
2295        struct mr_table *mrt = it->mrt;
2296
2297        if (it->cache == &mrt->mfc_unres_queue)
2298                spin_unlock_bh(&mfc_unres_lock);
2299        else if (it->cache == &mrt->mfc_cache_array[it->ct])
2300                read_unlock(&mrt_lock);
2301}
2302
2303static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2304{
2305        int n;
2306
2307        if (v == SEQ_START_TOKEN) {
2308                seq_puts(seq,
2309                 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
2310        } else {
2311                const struct mfc_cache *mfc = v;
2312                const struct ipmr_mfc_iter *it = seq->private;
2313                const struct mr_table *mrt = it->mrt;
2314
2315                seq_printf(seq, "%08X %08X %-3hd",
2316                           (__force u32) mfc->mfc_mcastgrp,
2317                           (__force u32) mfc->mfc_origin,
2318                           mfc->mfc_parent);
2319
2320                if (it->cache != &mrt->mfc_unres_queue) {
2321                        seq_printf(seq, " %8lu %8lu %8lu",
2322                                   mfc->mfc_un.res.pkt,
2323                                   mfc->mfc_un.res.bytes,
2324                                   mfc->mfc_un.res.wrong_if);
2325                        for (n = mfc->mfc_un.res.minvif;
2326                             n < mfc->mfc_un.res.maxvif; n++) {
2327                                if (VIF_EXISTS(mrt, n) &&
2328                                    mfc->mfc_un.res.ttls[n] < 255)
2329                                        seq_printf(seq,
2330                                           " %2d:%-3d",
2331                                           n, mfc->mfc_un.res.ttls[n]);
2332                        }
2333                } else {
2334                        /* unresolved mfc_caches don't contain
2335                         * pkt, bytes and wrong_if values
2336                         */
2337                        seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
2338                }
2339                seq_putc(seq, '\n');
2340        }
2341        return 0;
2342}
2343
2344static const struct seq_operations ipmr_mfc_seq_ops = {
2345        .start = ipmr_mfc_seq_start,
2346        .next  = ipmr_mfc_seq_next,
2347        .stop  = ipmr_mfc_seq_stop,
2348        .show  = ipmr_mfc_seq_show,
2349};
2350
2351static int ipmr_mfc_open(struct inode *inode, struct file *file)
2352{
2353        return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
2354                            sizeof(struct ipmr_mfc_iter));
2355}
2356
2357static const struct file_operations ipmr_mfc_fops = {
2358        .owner   = THIS_MODULE,
2359        .open    = ipmr_mfc_open,
2360        .read    = seq_read,
2361        .llseek  = seq_lseek,
2362        .release = seq_release_net,
2363};
2364#endif
2365
2366#ifdef CONFIG_IP_PIMSM_V2
2367static const struct net_protocol pim_protocol = {
2368        .handler        =       pim_rcv,
2369        .netns_ok       =       1,
2370};
2371#endif
2372
2373
2374/*
2375 *      Setup for IP multicast routing
2376 */
2377static int __net_init ipmr_net_init(struct net *net)
2378{
2379        int err;
2380
2381        err = ipmr_rules_init(net);
2382        if (err < 0)
2383                goto fail;
2384
2385#ifdef CONFIG_PROC_FS
2386        err = -ENOMEM;
2387        if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
2388                goto proc_vif_fail;
2389        if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
2390                goto proc_cache_fail;
2391#endif
2392        return 0;
2393
2394#ifdef CONFIG_PROC_FS
2395proc_cache_fail:
2396        proc_net_remove(net, "ip_mr_vif");
2397proc_vif_fail:
2398        ipmr_rules_exit(net);
2399#endif
2400fail:
2401        return err;
2402}
2403
2404static void __net_exit ipmr_net_exit(struct net *net)
2405{
2406#ifdef CONFIG_PROC_FS
2407        proc_net_remove(net, "ip_mr_cache");
2408        proc_net_remove(net, "ip_mr_vif");
2409#endif
2410        ipmr_rules_exit(net);
2411}
2412
2413static struct pernet_operations ipmr_net_ops = {
2414        .init = ipmr_net_init,
2415        .exit = ipmr_net_exit,
2416};
2417
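/*
 * Set up the multicast routing subsystem: the MFC slab cache, per-net
 * tables, the netdevice notifier, the optional PIMv2 protocol handler
 * and the netlink dump handler.
 */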
2418int __init ip_mr_init(void)
2419{
2420        int err;
2421
2422        mrt_cachep = kmem_cache_create("ip_mrt_cache",
2423                                       sizeof(struct mfc_cache),
2424                                       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2425                                       NULL);
2426        if (!mrt_cachep)
2427                return -ENOMEM;
2428
2429        err = register_pernet_subsys(&ipmr_net_ops);
2430        if (err)
2431                goto reg_pernet_fail;
2432
2433        err = register_netdevice_notifier(&ip_mr_notifier);
2434        if (err)
2435                goto reg_notif_fail;
2436#ifdef CONFIG_IP_PIMSM_V2
2437        if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2438                printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2439                err = -EAGAIN;
2440                goto add_proto_fail;
2441        }
2442#endif
2443        rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
2444        return 0;
2445
2446#ifdef CONFIG_IP_PIMSM_V2
2447add_proto_fail:
2448        unregister_netdevice_notifier(&ip_mr_notifier);
2449#endif
2450reg_notif_fail:
2451        unregister_pernet_subsys(&ipmr_net_ops);
2452reg_pernet_fail:
2453        kmem_cache_destroy(mrt_cachep);
2454        return err;
2455}
2456