linux/net/ipv4/fib_semantics.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IPv4 Forwarding Information Base: semantics.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"

static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize this to get rid of the dummy loop */

#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
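
/*
 * Usage sketch (illustrative): the macros above open a block declaring
 * 'nhsel' and 'nh', so a traversal reads
 *
 *      for_nexthops(fi) {
 *              if (nh->nh_flags & RTNH_F_DEAD)
 *                      continue;
 *      } endfor_nexthops(fi);
 *
 * endfor_nexthops() supplies the closing brace of that block, which is
 * why every for_nexthops()/change_nexthops() must be paired with it.
 */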


static const struct
{
        int     error;
        u8      scope;
} fib_props[RTN_MAX + 1] = {
        {
                .error  = 0,
                .scope  = RT_SCOPE_NOWHERE,
        },      /* RTN_UNSPEC */
        {
                .error  = 0,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_UNICAST */
        {
                .error  = 0,
                .scope  = RT_SCOPE_HOST,
        },      /* RTN_LOCAL */
        {
                .error  = 0,
                .scope  = RT_SCOPE_LINK,
        },      /* RTN_BROADCAST */
        {
                .error  = 0,
                .scope  = RT_SCOPE_LINK,
        },      /* RTN_ANYCAST */
        {
                .error  = 0,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_MULTICAST */
        {
                .error  = -EINVAL,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_BLACKHOLE */
        {
                .error  = -EHOSTUNREACH,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_UNREACHABLE */
        {
                .error  = -EACCES,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_PROHIBIT */
        {
                .error  = -EAGAIN,
                .scope  = RT_SCOPE_UNIVERSE,
        },      /* RTN_THROW */
        {
                .error  = -EINVAL,
                .scope  = RT_SCOPE_NOWHERE,
        },      /* RTN_NAT */
        {
                .error  = -EINVAL,
                .scope  = RT_SCOPE_NOWHERE,
        },      /* RTN_XRESOLVE */
};
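
/*
 * Illustration (no new behaviour, just how the table is consumed):
 * fib_semantic_match() below returns fib_props[fa->fa_type].error for
 * control route types, so a lookup hitting an RTN_UNREACHABLE alias
 * fails with -EHOSTUNREACH, an RTN_PROHIBIT alias with -EACCES and an
 * RTN_THROW alias with -EAGAIN.  The scope field is the minimum scope
 * value a route of that type may carry; fib_create_info() rejects a
 * config whose fc_scope is numerically smaller (i.e. wider).
 */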


/* Release a nexthop info record */

void free_fib_info(struct fib_info *fi)
{
        if (fi->fib_dead == 0) {
                printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
                return;
        }
        change_nexthops(fi) {
                if (nh->nh_dev)
                        dev_put(nh->nh_dev);
                nh->nh_dev = NULL;
        } endfor_nexthops(fi);
        fib_info_cnt--;
        release_net(fi->fib_net);
        kfree(fi);
}

void fib_release_info(struct fib_info *fi)
{
        spin_lock_bh(&fib_info_lock);
        if (fi && --fi->fib_treeref == 0) {
                hlist_del(&fi->fib_hash);
                if (fi->fib_prefsrc)
                        hlist_del(&fi->fib_lhash);
                change_nexthops(fi) {
                        if (!nh->nh_dev)
                                continue;
                        hlist_del(&nh->nh_hash);
                } endfor_nexthops(fi)
                fi->fib_dead = 1;
                fib_info_put(fi);
        }
        spin_unlock_bh(&fib_info_lock);
}

static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
        const struct fib_nh *onh = ofi->fib_nh;

        for_nexthops(fi) {
                if (nh->nh_oif != onh->nh_oif ||
                    nh->nh_gw  != onh->nh_gw ||
                    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
                    nh->nh_tclassid != onh->nh_tclassid ||
#endif
                    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
                        return -1;
                onh++;
        } endfor_nexthops(fi);
        return 0;
}

static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
        unsigned int mask = DEVINDEX_HASHSIZE - 1;

        return (val ^
                (val >> DEVINDEX_HASHBITS) ^
                (val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
        unsigned int mask = (fib_hash_size - 1);
        unsigned int val = fi->fib_nhs;

        val ^= fi->fib_protocol;
        val ^= (__force u32)fi->fib_prefsrc;
        val ^= fi->fib_priority;
        for_nexthops(fi) {
                val ^= fib_devindex_hashfn(nh->nh_oif);
        } endfor_nexthops(fi)

        return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
        struct hlist_head *head;
        struct hlist_node *node;
        struct fib_info *fi;
        unsigned int hash;

        hash = fib_info_hashfn(nfi);
        head = &fib_info_hash[hash];

        hlist_for_each_entry(fi, node, head, fib_hash) {
                if (fi->fib_net != nfi->fib_net)
                        continue;
                if (fi->fib_nhs != nfi->fib_nhs)
                        continue;
                if (nfi->fib_protocol == fi->fib_protocol &&
                    nfi->fib_prefsrc == fi->fib_prefsrc &&
                    nfi->fib_priority == fi->fib_priority &&
                    memcmp(nfi->fib_metrics, fi->fib_metrics,
                           sizeof(fi->fib_metrics)) == 0 &&
                    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
                    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
                        return fi;
        }

        return NULL;
}

/* Check that the gateway is already configured.
   Used only by the redirect acceptance routine.
 */

int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
        struct hlist_head *head;
        struct hlist_node *node;
        struct fib_nh *nh;
        unsigned int hash;

        spin_lock(&fib_info_lock);

        hash = fib_devindex_hashfn(dev->ifindex);
        head = &fib_info_devhash[hash];
        hlist_for_each_entry(nh, node, head, nh_hash) {
                if (nh->nh_dev == dev &&
                    nh->nh_gw == gw &&
                    !(nh->nh_flags&RTNH_F_DEAD)) {
                        spin_unlock(&fib_info_lock);
                        return 0;
                }
        }

        spin_unlock(&fib_info_lock);

        return -1;
}

static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
        size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
                         + nla_total_size(4) /* RTA_TABLE */
                         + nla_total_size(4) /* RTA_DST */
                         + nla_total_size(4) /* RTA_PRIORITY */
                         + nla_total_size(4); /* RTA_PREFSRC */

        /* space for nested metrics */
        payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

        if (fi->fib_nhs) {
                /* Also handles the special case fib_nhs == 1 */

                /* each nexthop is packed in an attribute */
                size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

                /* may contain flow and gateway attribute */
                nhsize += 2 * nla_total_size(4);

                /* all nexthops are packed in a nested attribute */
                payload += nla_total_size(fi->fib_nhs * nhsize);
        }

        return payload;
}

void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
               int dst_len, u32 tb_id, struct nl_info *info,
               unsigned int nlm_flags)
{
        struct sk_buff *skb;
        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
        int err = -ENOBUFS;

        skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
        if (skb == NULL)
                goto errout;

        err = fib_dump_info(skb, info->pid, seq, event, tb_id,
                            fa->fa_type, fa->fa_scope, key, dst_len,
                            fa->fa_tos, fa->fa_info, nlm_flags);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }
        rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
                    info->nlh, GFP_KERNEL);
        return;
errout:
        if (err < 0)
                rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
        if (fah) {
                struct fib_alias *fa;
                list_for_each_entry(fa, fah, fa_list) {
                        if (fa->fa_tos > tos)
                                continue;
                        if (fa->fa_info->fib_priority >= prio ||
                            fa->fa_tos < tos)
                                return fa;
                }
        }
        return NULL;
}
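
/*
 * Illustration (assumes the ordering the fib tables maintain:
 * decreasing fa_tos, then increasing fib_priority): searching the list
 *      (tos=8, prio=0), (tos=0, prio=1), (tos=0, prio=10)
 * with tos=0, prio=5 skips the first entry (higher TOS), skips the
 * second (lower priority) and returns the third, i.e. the position at
 * which a new (tos=0, prio=5) alias would be inserted.
 */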

int fib_detect_death(struct fib_info *fi, int order,
                     struct fib_info **last_resort, int *last_idx, int dflt)
{
        struct neighbour *n;
        int state = NUD_NONE;

        n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
        if (n) {
                state = n->nud_state;
                neigh_release(n);
        }
        if (state == NUD_REACHABLE)
                return 0;
        if ((state&NUD_VALID) && order != dflt)
                return 0;
        if ((state&NUD_VALID) ||
            (*last_idx<0 && order > dflt)) {
                *last_resort = fi;
                *last_idx = order;
        }
        return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
        int nhs = 0;

        while (rtnh_ok(rtnh, remaining)) {
                nhs++;
                rtnh = rtnh_next(rtnh, &remaining);
        }

        /* leftover implies invalid nexthop configuration, discard it */
        return remaining > 0 ? 0 : nhs;
}

static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
                       int remaining, struct fib_config *cfg)
{
        change_nexthops(fi) {
                int attrlen;

                if (!rtnh_ok(rtnh, remaining))
                        return -EINVAL;

                nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
                nh->nh_oif = rtnh->rtnh_ifindex;
                nh->nh_weight = rtnh->rtnh_hops + 1;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
                        nla = nla_find(attrs, attrlen, RTA_FLOW);
                        nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
                }

                rtnh = rtnh_next(rtnh, &remaining);
        } endfor_nexthops(fi);

        return 0;
}

#endif

int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        struct rtnexthop *rtnh;
        int remaining;
#endif

        if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
                return 1;

        if (cfg->fc_oif || cfg->fc_gw) {
                if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
                    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
                        return 0;
                return 1;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (cfg->fc_mp == NULL)
                return 0;

        rtnh = cfg->fc_mp;
        remaining = cfg->fc_mp_len;

        for_nexthops(fi) {
                int attrlen;

                if (!rtnh_ok(rtnh, remaining))
                        return -EINVAL;

                if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
                        return 1;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        if (nla && nla_get_be32(nla) != nh->nh_gw)
                                return 1;
#ifdef CONFIG_NET_CLS_ROUTE
                        nla = nla_find(attrs, attrlen, RTA_FLOW);
                        if (nla && nla_get_u32(nla) != nh->nh_tclassid)
                                return 1;
#endif
                }

                rtnh = rtnh_next(rtnh, &remaining);
        } endfor_nexthops(fi);
#endif
        return 0;
}


/*
   Picture
   -------

   The semantics of nexthops are very messy for historical reasons.
   We have to take into account that:
   a) the gateway can actually be a local interface address,
      so that a gatewayed route is direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr but by a direct route.
   c) if both gateway and interface are specified, they should not
      contradict.
   d) if we use tunnel routes, the gateway may not be on-link.

   Attempting to reconcile all of these (alas, self-contradictory) conditions
   results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the amount
   of code barely increases, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or if the "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g. as a by-product it allows
   independent exterior and interior routing processes to
   coexist in peace.

   Normally it looks like the following.

   {universe prefix}  -> (gw, oif) [scope link]
                          |
                          |-> {link prefix} -> (gw, oif) [scope local]
                                                |
                                                |-> {local prefix} (terminal node)
 */
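
/*
 * Illustrative example (the command is iproute2, not this file): after
 *
 *      ip route add 10.0.0.0/8 via 192.168.1.1
 *
 * the nexthop 192.168.1.1 must itself resolve through a route of
 * narrower scope, typically the link-scope connected prefix
 * 192.168.1.0/24 on some interface, which in turn terminates at the
 * host-scope local address on that interface.  fib_check_nh() below
 * performs this recursive lookup, clamping the lookup scope to at
 * least RT_SCOPE_LINK.
 */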

static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
                        struct fib_nh *nh)
{
        int err;
        struct net *net;

        net = cfg->fc_nlinfo.nl_net;
        if (nh->nh_gw) {
                struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
                if (nh->nh_flags&RTNH_F_PERVASIVE)
                        return 0;
#endif
                if (nh->nh_flags&RTNH_F_ONLINK) {
                        struct net_device *dev;

                        if (cfg->fc_scope >= RT_SCOPE_LINK)
                                return -EINVAL;
                        if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
                                return -EINVAL;
                        if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
                                return -ENODEV;
                        if (!(dev->flags&IFF_UP))
                                return -ENETDOWN;
                        nh->nh_dev = dev;
                        dev_hold(dev);
                        nh->nh_scope = RT_SCOPE_LINK;
                        return 0;
                }
                {
                        struct flowi fl = {
                                .nl_u = {
                                        .ip4_u = {
                                                .daddr = nh->nh_gw,
                                                .scope = cfg->fc_scope + 1,
                                        },
                                },
                                .oif = nh->nh_oif,
                        };

                        /* It is not necessary, but requires a bit of thinking */
                        if (fl.fl4_scope < RT_SCOPE_LINK)
                                fl.fl4_scope = RT_SCOPE_LINK;
                        if ((err = fib_lookup(net, &fl, &res)) != 0)
                                return err;
                }
                err = -EINVAL;
                if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
                        goto out;
                nh->nh_scope = res.scope;
                nh->nh_oif = FIB_RES_OIF(res);
                if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
                        goto out;
                dev_hold(nh->nh_dev);
                err = -ENETDOWN;
                if (!(nh->nh_dev->flags & IFF_UP))
                        goto out;
                err = 0;
out:
                fib_res_put(&res);
                return err;
        } else {
                struct in_device *in_dev;

                if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
                        return -EINVAL;

                in_dev = inetdev_by_index(net, nh->nh_oif);
                if (in_dev == NULL)
                        return -ENODEV;
                if (!(in_dev->dev->flags&IFF_UP)) {
                        in_dev_put(in_dev);
                        return -ENETDOWN;
                }
                nh->nh_dev = in_dev->dev;
                dev_hold(nh->nh_dev);
                nh->nh_scope = RT_SCOPE_HOST;
                in_dev_put(in_dev);
        }
        return 0;
}

static inline unsigned int fib_laddr_hashfn(__be32 val)
{
        unsigned int mask = (fib_hash_size - 1);

        return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
}

static struct hlist_head *fib_hash_alloc(int bytes)
{
        if (bytes <= PAGE_SIZE)
                return kzalloc(bytes, GFP_KERNEL);
        else
                return (struct hlist_head *)
                        __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
}

static void fib_hash_free(struct hlist_head *hash, int bytes)
{
        if (!hash)
                return;

        if (bytes <= PAGE_SIZE)
                kfree(hash);
        else
                free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_hash_move(struct hlist_head *new_info_hash,
                          struct hlist_head *new_laddrhash,
                          unsigned int new_size)
{
        struct hlist_head *old_info_hash, *old_laddrhash;
        unsigned int old_size = fib_hash_size;
        unsigned int i, bytes;

        spin_lock_bh(&fib_info_lock);
        old_info_hash = fib_info_hash;
        old_laddrhash = fib_info_laddrhash;
        fib_hash_size = new_size;

        for (i = 0; i < old_size; i++) {
                struct hlist_head *head = &fib_info_hash[i];
                struct hlist_node *node, *n;
                struct fib_info *fi;

                hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
                        struct hlist_head *dest;
                        unsigned int new_hash;

                        hlist_del(&fi->fib_hash);

                        new_hash = fib_info_hashfn(fi);
                        dest = &new_info_hash[new_hash];
                        hlist_add_head(&fi->fib_hash, dest);
                }
        }
        fib_info_hash = new_info_hash;

        for (i = 0; i < old_size; i++) {
                struct hlist_head *lhead = &fib_info_laddrhash[i];
                struct hlist_node *node, *n;
                struct fib_info *fi;

                hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
                        struct hlist_head *ldest;
                        unsigned int new_hash;

                        hlist_del(&fi->fib_lhash);

                        new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
                        ldest = &new_laddrhash[new_hash];
                        hlist_add_head(&fi->fib_lhash, ldest);
                }
        }
        fib_info_laddrhash = new_laddrhash;

        spin_unlock_bh(&fib_info_lock);

        bytes = old_size * sizeof(struct hlist_head *);
        fib_hash_free(old_info_hash, bytes);
        fib_hash_free(old_laddrhash, bytes);
}

struct fib_info *fib_create_info(struct fib_config *cfg)
{
        int err;
        struct fib_info *fi = NULL;
        struct fib_info *ofi;
        int nhs = 1;
        struct net *net = cfg->fc_nlinfo.nl_net;

        /* Fast check to catch the weirdest cases */
        if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
                goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (cfg->fc_mp) {
                nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
                if (nhs == 0)
                        goto err_inval;
        }
#endif

        err = -ENOBUFS;
        if (fib_info_cnt >= fib_hash_size) {
                unsigned int new_size = fib_hash_size << 1;
                struct hlist_head *new_info_hash;
                struct hlist_head *new_laddrhash;
                unsigned int bytes;

                if (!new_size)
                        new_size = 1;
                bytes = new_size * sizeof(struct hlist_head *);
                new_info_hash = fib_hash_alloc(bytes);
                new_laddrhash = fib_hash_alloc(bytes);
                if (!new_info_hash || !new_laddrhash) {
                        fib_hash_free(new_info_hash, bytes);
                        fib_hash_free(new_laddrhash, bytes);
                } else
                        fib_hash_move(new_info_hash, new_laddrhash, new_size);

                if (!fib_hash_size)
                        goto failure;
        }

        fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
        if (fi == NULL)
                goto failure;
        fib_info_cnt++;

        fi->fib_net = hold_net(net);
        fi->fib_protocol = cfg->fc_protocol;
        fi->fib_flags = cfg->fc_flags;
        fi->fib_priority = cfg->fc_priority;
        fi->fib_prefsrc = cfg->fc_prefsrc;

        fi->fib_nhs = nhs;
        change_nexthops(fi) {
                nh->nh_parent = fi;
        } endfor_nexthops(fi)

        if (cfg->fc_mx) {
                struct nlattr *nla;
                int remaining;

                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                        int type = nla_type(nla);

                        if (type) {
                                if (type > RTAX_MAX)
                                        goto err_inval;
                                fi->fib_metrics[type - 1] = nla_get_u32(nla);
                        }
                }
        }

        if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
                if (err != 0)
                        goto failure;
                if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
                        goto err_inval;
                if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
                        goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
                if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
                        goto err_inval;
#endif
#else
                goto err_inval;
#endif
        } else {
                struct fib_nh *nh = fi->fib_nh;

                nh->nh_oif = cfg->fc_oif;
                nh->nh_gw = cfg->fc_gw;
                nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
                nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                nh->nh_weight = 1;
#endif
        }

        if (fib_props[cfg->fc_type].error) {
                if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
                        goto err_inval;
                goto link_it;
        }

        if (cfg->fc_scope > RT_SCOPE_HOST)
                goto err_inval;

        if (cfg->fc_scope == RT_SCOPE_HOST) {
                struct fib_nh *nh = fi->fib_nh;

                /* Local address is added. */
                if (nhs != 1 || nh->nh_gw)
                        goto err_inval;
                nh->nh_scope = RT_SCOPE_NOWHERE;
                nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
                err = -ENODEV;
                if (nh->nh_dev == NULL)
                        goto failure;
        } else {
                change_nexthops(fi) {
                        if ((err = fib_check_nh(cfg, fi, nh)) != 0)
                                goto failure;
                } endfor_nexthops(fi)
        }

        if (fi->fib_prefsrc) {
                if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
                    fi->fib_prefsrc != cfg->fc_dst)
                        if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
                                goto err_inval;
        }

link_it:
        if ((ofi = fib_find_info(fi)) != NULL) {
                fi->fib_dead = 1;
                free_fib_info(fi);
                ofi->fib_treeref++;
                return ofi;
        }

        fi->fib_treeref++;
        atomic_inc(&fi->fib_clntref);
        spin_lock_bh(&fib_info_lock);
        hlist_add_head(&fi->fib_hash,
                       &fib_info_hash[fib_info_hashfn(fi)]);
        if (fi->fib_prefsrc) {
                struct hlist_head *head;

                head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
                hlist_add_head(&fi->fib_lhash, head);
        }
        change_nexthops(fi) {
                struct hlist_head *head;
                unsigned int hash;

                if (!nh->nh_dev)
                        continue;
                hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
                head = &fib_info_devhash[hash];
                hlist_add_head(&nh->nh_hash, head);
        } endfor_nexthops(fi)
        spin_unlock_bh(&fib_info_lock);
        return fi;

err_inval:
        err = -EINVAL;

failure:
        if (fi) {
                fi->fib_dead = 1;
                free_fib_info(fi);
        }

        return ERR_PTR(err);
}

/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
                       struct fib_result *res, __be32 zone, __be32 mask,
                        int prefixlen)
{
        struct fib_alias *fa;
        int nh_sel = 0;

        list_for_each_entry_rcu(fa, head, fa_list) {
                int err;

                if (fa->fa_tos &&
                    fa->fa_tos != flp->fl4_tos)
                        continue;

                if (fa->fa_scope < flp->fl4_scope)
                        continue;

                fa->fa_state |= FA_S_ACCESSED;

                err = fib_props[fa->fa_type].error;
                if (err == 0) {
                        struct fib_info *fi = fa->fa_info;

                        if (fi->fib_flags & RTNH_F_DEAD)
                                continue;

                        switch (fa->fa_type) {
                        case RTN_UNICAST:
                        case RTN_LOCAL:
                        case RTN_BROADCAST:
                        case RTN_ANYCAST:
                        case RTN_MULTICAST:
                                for_nexthops(fi) {
                                        if (nh->nh_flags&RTNH_F_DEAD)
                                                continue;
                                        if (!flp->oif || flp->oif == nh->nh_oif)
                                                break;
                                }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                                if (nhsel < fi->fib_nhs) {
                                        nh_sel = nhsel;
                                        goto out_fill_res;
                                }
#else
                                if (nhsel < 1) {
                                        goto out_fill_res;
                                }
#endif
                                endfor_nexthops(fi);
                                continue;

                        default:
                                printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
                                        fa->fa_type);
                                return -EINVAL;
                        }
                }
                return err;
        }
        return 1;

out_fill_res:
        res->prefixlen = prefixlen;
        res->nh_sel = nh_sel;
        res->type = fa->fa_type;
        res->scope = fa->fa_scope;
        res->fi = fa->fa_info;
        atomic_inc(&res->fi->fib_clntref);
        return 0;
}

/* Find an appropriate source address for this destination */

__be32 __fib_res_prefsrc(struct fib_result *res)
{
        return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}

int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
                  struct fib_info *fi, unsigned int flags)
{
        struct nlmsghdr *nlh;
        struct rtmsg *rtm;

        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET;
        rtm->rtm_dst_len = dst_len;
        rtm->rtm_src_len = 0;
        rtm->rtm_tos = tos;
        if (tb_id < 256)
                rtm->rtm_table = tb_id;
        else
                rtm->rtm_table = RT_TABLE_COMPAT;
        NLA_PUT_U32(skb, RTA_TABLE, tb_id);
        rtm->rtm_type = type;
        rtm->rtm_flags = fi->fib_flags;
        rtm->rtm_scope = scope;
        rtm->rtm_protocol = fi->fib_protocol;

        if (rtm->rtm_dst_len)
                NLA_PUT_BE32(skb, RTA_DST, dst);

        if (fi->fib_priority)
                NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

        if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
                goto nla_put_failure;

        if (fi->fib_prefsrc)
                NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

        if (fi->fib_nhs == 1) {
                if (fi->fib_nh->nh_gw)
                        NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

                if (fi->fib_nh->nh_oif)
                        NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
                if (fi->fib_nh[0].nh_tclassid)
                        NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
        }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (fi->fib_nhs > 1) {
                struct rtnexthop *rtnh;
                struct nlattr *mp;

                mp = nla_nest_start(skb, RTA_MULTIPATH);
                if (mp == NULL)
                        goto nla_put_failure;

                for_nexthops(fi) {
                        rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
                        if (rtnh == NULL)
                                goto nla_put_failure;

                        rtnh->rtnh_flags = nh->nh_flags & 0xFF;
                        rtnh->rtnh_hops = nh->nh_weight - 1;
                        rtnh->rtnh_ifindex = nh->nh_oif;

                        if (nh->nh_gw)
                                NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
                        if (nh->nh_tclassid)
                                NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
                        /* length of rtnetlink header + attributes */
                        rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
                } endfor_nexthops(fi);

                nla_nest_end(skb, mp);
        }
#endif
        return nlmsg_end(skb, nlh);

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
   Update the FIB if:
   - a local address disappeared -> we must delete all the entries
     referring to it.
   - a device went down -> we must shut down all nexthops going via it.
 */
int fib_sync_down_addr(struct net *net, __be32 local)
{
        int ret = 0;
        unsigned int hash = fib_laddr_hashfn(local);
        struct hlist_head *head = &fib_info_laddrhash[hash];
        struct hlist_node *node;
        struct fib_info *fi;

        if (fib_info_laddrhash == NULL || local == 0)
                return 0;

        hlist_for_each_entry(fi, node, head, fib_lhash) {
                if (fi->fib_net != net)
                        continue;
                if (fi->fib_prefsrc == local) {
                        fi->fib_flags |= RTNH_F_DEAD;
                        ret++;
                }
        }
        return ret;
}

int fib_sync_down_dev(struct net_device *dev, int force)
{
        int ret = 0;
        int scope = RT_SCOPE_NOWHERE;
        struct fib_info *prev_fi = NULL;
        unsigned int hash = fib_devindex_hashfn(dev->ifindex);
        struct hlist_head *head = &fib_info_devhash[hash];
        struct hlist_node *node;
        struct fib_nh *nh;

        if (force)
                scope = -1;

        hlist_for_each_entry(nh, node, head, nh_hash) {
                struct fib_info *fi = nh->nh_parent;
                int dead;

                BUG_ON(!fi->fib_nhs);
                if (nh->nh_dev != dev || fi == prev_fi)
                        continue;
                prev_fi = fi;
                dead = 0;
                change_nexthops(fi) {
                        if (nh->nh_flags&RTNH_F_DEAD)
                                dead++;
                        else if (nh->nh_dev == dev &&
                                        nh->nh_scope != scope) {
                                nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                                spin_lock_bh(&fib_multipath_lock);
                                fi->fib_power -= nh->nh_power;
                                nh->nh_power = 0;
                                spin_unlock_bh(&fib_multipath_lock);
#endif
                                dead++;
                        }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                        if (force > 1 && nh->nh_dev == dev) {
                                dead = fi->fib_nhs;
                                break;
                        }
#endif
                } endfor_nexthops(fi)
                if (dead == fi->fib_nhs) {
                        fi->fib_flags |= RTNH_F_DEAD;
                        ret++;
                }
        }

        return ret;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
   A dead device goes up. We wake up dead nexthops.
   This makes sense only on multipath routes.
 */

int fib_sync_up(struct net_device *dev)
{
        struct fib_info *prev_fi;
        unsigned int hash;
        struct hlist_head *head;
        struct hlist_node *node;
        struct fib_nh *nh;
        int ret;

        if (!(dev->flags&IFF_UP))
                return 0;

        prev_fi = NULL;
        hash = fib_devindex_hashfn(dev->ifindex);
        head = &fib_info_devhash[hash];
        ret = 0;

        hlist_for_each_entry(nh, node, head, nh_hash) {
                struct fib_info *fi = nh->nh_parent;
                int alive;

                BUG_ON(!fi->fib_nhs);
                if (nh->nh_dev != dev || fi == prev_fi)
                        continue;

                prev_fi = fi;
                alive = 0;
                change_nexthops(fi) {
                        if (!(nh->nh_flags&RTNH_F_DEAD)) {
                                alive++;
                                continue;
                        }
                        if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
                                continue;
                        if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
                                continue;
                        alive++;
                        spin_lock_bh(&fib_multipath_lock);
                        nh->nh_power = 0;
                        nh->nh_flags &= ~RTNH_F_DEAD;
                        spin_unlock_bh(&fib_multipath_lock);
                } endfor_nexthops(fi)

                if (alive > 0) {
                        fi->fib_flags &= ~RTNH_F_DEAD;
                        ret++;
                }
        }

        return ret;
}

/*
   The algorithm is suboptimal, but it provides really
   fair weighted route distribution.
 */
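
/*
 * Worked example (illustrative): with two live nexthops of weight 2
 * and 1, the refill below sets nh_power to 2 and 1 (fib_power = 3);
 * each selection then spends one credit on the chosen nexthop, so
 * every refill cycle of three selections sends exactly two of them
 * via the first nexthop and one via the second, i.e. a 2:1 split.
 */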

void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
        struct fib_info *fi = res->fi;
        int w;

        spin_lock_bh(&fib_multipath_lock);
        if (fi->fib_power <= 0) {
                int power = 0;
                change_nexthops(fi) {
                        if (!(nh->nh_flags&RTNH_F_DEAD)) {
                                power += nh->nh_weight;
                                nh->nh_power = nh->nh_weight;
                        }
                } endfor_nexthops(fi);
                fi->fib_power = power;
                if (power <= 0) {
                        spin_unlock_bh(&fib_multipath_lock);
                        /* Race condition: route has just become dead. */
                        res->nh_sel = 0;
                        return;
                }
        }


        /* w should be a random number in [0..fi->fib_power-1];
           using jiffies for it is a pretty bad approximation.
         */

        w = jiffies % fi->fib_power;

        change_nexthops(fi) {
                if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
                        if ((w -= nh->nh_power) <= 0) {
                                nh->nh_power--;
                                fi->fib_power--;
                                res->nh_sel = nhsel;
                                spin_unlock_bh(&fib_multipath_lock);
                                return;
                        }
                }
        } endfor_nexthops(fi);

        /* Race condition: route has just become dead. */
        res->nh_sel = 0;
        spin_unlock_bh(&fib_multipath_lock);
}
#endif
