linux/net/ipv4/fib_semantics.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              IPv4 Forwarding Information Base: semantics.
   7 *
   8 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   9 *
  10 *              This program is free software; you can redistribute it and/or
  11 *              modify it under the terms of the GNU General Public License
  12 *              as published by the Free Software Foundation; either version
  13 *              2 of the License, or (at your option) any later version.
  14 */
  15
  16#include <asm/uaccess.h>
  17#include <asm/system.h>
  18#include <linux/bitops.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/jiffies.h>
  22#include <linux/mm.h>
  23#include <linux/string.h>
  24#include <linux/socket.h>
  25#include <linux/sockios.h>
  26#include <linux/errno.h>
  27#include <linux/in.h>
  28#include <linux/inet.h>
  29#include <linux/inetdevice.h>
  30#include <linux/netdevice.h>
  31#include <linux/if_arp.h>
  32#include <linux/proc_fs.h>
  33#include <linux/skbuff.h>
  34#include <linux/init.h>
  35#include <linux/slab.h>
  36
  37#include <net/arp.h>
  38#include <net/ip.h>
  39#include <net/protocol.h>
  40#include <net/route.h>
  41#include <net/tcp.h>
  42#include <net/sock.h>
  43#include <net/ip_fib.h>
  44#include <net/netlink.h>
  45#include <net/nexthop.h>
  46
  47#include "fib_lookup.h"
  48
  49static DEFINE_SPINLOCK(fib_info_lock);
  50static struct hlist_head *fib_info_hash;
  51static struct hlist_head *fib_info_laddrhash;
  52static unsigned int fib_hash_size;
  53static unsigned int fib_info_cnt;
  54
  55#define DEVINDEX_HASHBITS 8
  56#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
  57static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
  58
  59#ifdef CONFIG_IP_ROUTE_MULTIPATH
  60
  61static DEFINE_SPINLOCK(fib_multipath_lock);
  62
  63#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
  64for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
  65
  66#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
  67for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
  68
  69#else /* CONFIG_IP_ROUTE_MULTIPATH */
  70
   71/* Hope that gcc will optimize the dummy loop away */
  72
  73#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
  74for (nhsel=0; nhsel < 1; nhsel++)
  75
  76#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
  77for (nhsel=0; nhsel < 1; nhsel++)
  78
  79#endif /* CONFIG_IP_ROUTE_MULTIPATH */
  80
  81#define endfor_nexthops(fi) }
  82
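/*
 * Editor's note: a minimal usage sketch (not part of the original file; the
 * function name is hypothetical) showing how the macros above are meant to
 * be paired.  for_nexthops() opens a scope holding the iteration variables
 * (nhsel, nh), and endfor_nexthops() closes that scope, so every use must
 * be terminated with endfor_nexthops():
 */
#if 0
static int example_count_gateways(const struct fib_info *fi)
{
	int n = 0;

	for_nexthops(fi) {
		if (nh->nh_gw)
			n++;
	} endfor_nexthops(fi);

	return n;
}
#endif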
  83
  84static const struct
  85{
  86        int     error;
  87        u8      scope;
  88} fib_props[RTN_MAX + 1] = {
  89        {
  90                .error  = 0,
  91                .scope  = RT_SCOPE_NOWHERE,
  92        },      /* RTN_UNSPEC */
  93        {
  94                .error  = 0,
  95                .scope  = RT_SCOPE_UNIVERSE,
  96        },      /* RTN_UNICAST */
  97        {
  98                .error  = 0,
  99                .scope  = RT_SCOPE_HOST,
 100        },      /* RTN_LOCAL */
 101        {
 102                .error  = 0,
 103                .scope  = RT_SCOPE_LINK,
 104        },      /* RTN_BROADCAST */
 105        {
 106                .error  = 0,
 107                .scope  = RT_SCOPE_LINK,
 108        },      /* RTN_ANYCAST */
 109        {
 110                .error  = 0,
 111                .scope  = RT_SCOPE_UNIVERSE,
 112        },      /* RTN_MULTICAST */
 113        {
 114                .error  = -EINVAL,
 115                .scope  = RT_SCOPE_UNIVERSE,
 116        },      /* RTN_BLACKHOLE */
 117        {
 118                .error  = -EHOSTUNREACH,
 119                .scope  = RT_SCOPE_UNIVERSE,
 120        },      /* RTN_UNREACHABLE */
 121        {
 122                .error  = -EACCES,
 123                .scope  = RT_SCOPE_UNIVERSE,
 124        },      /* RTN_PROHIBIT */
 125        {
 126                .error  = -EAGAIN,
 127                .scope  = RT_SCOPE_UNIVERSE,
 128        },      /* RTN_THROW */
 129        {
 130                .error  = -EINVAL,
 131                .scope  = RT_SCOPE_NOWHERE,
 132        },      /* RTN_NAT */
 133        {
 134                .error  = -EINVAL,
 135                .scope  = RT_SCOPE_NOWHERE,
 136        },      /* RTN_XRESOLVE */
 137};
 138
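/*
 * Editor's note (illustrative summary derived from the code below): this
 * table is indexed by route type.  fib_create_info() rejects a route whose
 * scope is narrower than the table entry, and fib_semantic_match() returns
 * the .error value of a matching entry, e.g. -EHOSTUNREACH for
 * RTN_UNREACHABLE routes and -EACCES for RTN_PROHIBIT routes; a zero
 * .error means the lookup proceeds to select a nexthop and fill in the
 * result.
 */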
 139
 140/* Release a nexthop info record */
 141
 142void free_fib_info(struct fib_info *fi)
 143{
 144        if (fi->fib_dead == 0) {
 145                printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
 146                return;
 147        }
 148        change_nexthops(fi) {
 149                if (nexthop_nh->nh_dev)
 150                        dev_put(nexthop_nh->nh_dev);
 151                nexthop_nh->nh_dev = NULL;
 152        } endfor_nexthops(fi);
 153        fib_info_cnt--;
 154        release_net(fi->fib_net);
 155        kfree(fi);
 156}
 157
 158void fib_release_info(struct fib_info *fi)
 159{
 160        spin_lock_bh(&fib_info_lock);
 161        if (fi && --fi->fib_treeref == 0) {
 162                hlist_del(&fi->fib_hash);
 163                if (fi->fib_prefsrc)
 164                        hlist_del(&fi->fib_lhash);
 165                change_nexthops(fi) {
 166                        if (!nexthop_nh->nh_dev)
 167                                continue;
 168                        hlist_del(&nexthop_nh->nh_hash);
 169                } endfor_nexthops(fi)
 170                fi->fib_dead = 1;
 171                fib_info_put(fi);
 172        }
 173        spin_unlock_bh(&fib_info_lock);
 174}
 175
 176static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 177{
 178        const struct fib_nh *onh = ofi->fib_nh;
 179
 180        for_nexthops(fi) {
 181                if (nh->nh_oif != onh->nh_oif ||
 182                    nh->nh_gw  != onh->nh_gw ||
 183                    nh->nh_scope != onh->nh_scope ||
 184#ifdef CONFIG_IP_ROUTE_MULTIPATH
 185                    nh->nh_weight != onh->nh_weight ||
 186#endif
 187#ifdef CONFIG_NET_CLS_ROUTE
 188                    nh->nh_tclassid != onh->nh_tclassid ||
 189#endif
 190                    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
 191                        return -1;
 192                onh++;
 193        } endfor_nexthops(fi);
 194        return 0;
 195}
 196
 197static inline unsigned int fib_devindex_hashfn(unsigned int val)
 198{
 199        unsigned int mask = DEVINDEX_HASHSIZE - 1;
 200
 201        return (val ^
 202                (val >> DEVINDEX_HASHBITS) ^
 203                (val >> (DEVINDEX_HASHBITS * 2))) & mask;
 204}
 205
 206static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
 207{
 208        unsigned int mask = (fib_hash_size - 1);
 209        unsigned int val = fi->fib_nhs;
 210
 211        val ^= fi->fib_protocol;
 212        val ^= (__force u32)fi->fib_prefsrc;
 213        val ^= fi->fib_priority;
 214        for_nexthops(fi) {
 215                val ^= fib_devindex_hashfn(nh->nh_oif);
 216        } endfor_nexthops(fi)
 217
 218        return (val ^ (val >> 7) ^ (val >> 12)) & mask;
 219}
 220
 221static struct fib_info *fib_find_info(const struct fib_info *nfi)
 222{
 223        struct hlist_head *head;
 224        struct hlist_node *node;
 225        struct fib_info *fi;
 226        unsigned int hash;
 227
 228        hash = fib_info_hashfn(nfi);
 229        head = &fib_info_hash[hash];
 230
 231        hlist_for_each_entry(fi, node, head, fib_hash) {
 232                if (!net_eq(fi->fib_net, nfi->fib_net))
 233                        continue;
 234                if (fi->fib_nhs != nfi->fib_nhs)
 235                        continue;
 236                if (nfi->fib_protocol == fi->fib_protocol &&
 237                    nfi->fib_prefsrc == fi->fib_prefsrc &&
 238                    nfi->fib_priority == fi->fib_priority &&
 239                    memcmp(nfi->fib_metrics, fi->fib_metrics,
 240                           sizeof(fi->fib_metrics)) == 0 &&
 241                    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
 242                    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
 243                        return fi;
 244        }
 245
 246        return NULL;
 247}
 248
  249/* Check that the gateway is already configured.
  250   Used only by the redirect acceptance routine.
 251 */
 252
 253int ip_fib_check_default(__be32 gw, struct net_device *dev)
 254{
 255        struct hlist_head *head;
 256        struct hlist_node *node;
 257        struct fib_nh *nh;
 258        unsigned int hash;
 259
 260        spin_lock(&fib_info_lock);
 261
 262        hash = fib_devindex_hashfn(dev->ifindex);
 263        head = &fib_info_devhash[hash];
 264        hlist_for_each_entry(nh, node, head, nh_hash) {
 265                if (nh->nh_dev == dev &&
 266                    nh->nh_gw == gw &&
 267                    !(nh->nh_flags&RTNH_F_DEAD)) {
 268                        spin_unlock(&fib_info_lock);
 269                        return 0;
 270                }
 271        }
 272
 273        spin_unlock(&fib_info_lock);
 274
 275        return -1;
 276}
 277
 278static inline size_t fib_nlmsg_size(struct fib_info *fi)
 279{
 280        size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
 281                         + nla_total_size(4) /* RTA_TABLE */
 282                         + nla_total_size(4) /* RTA_DST */
 283                         + nla_total_size(4) /* RTA_PRIORITY */
 284                         + nla_total_size(4); /* RTA_PREFSRC */
 285
 286        /* space for nested metrics */
 287        payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
 288
 289        if (fi->fib_nhs) {
 290                /* Also handles the special case fib_nhs == 1 */
 291
 292                /* each nexthop is packed in an attribute */
 293                size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
 294
 295                /* may contain flow and gateway attribute */
 296                nhsize += 2 * nla_total_size(4);
 297
 298                /* all nexthops are packed in a nested attribute */
 299                payload += nla_total_size(fi->fib_nhs * nhsize);
 300        }
 301
 302        return payload;
 303}
 304
 305void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 306               int dst_len, u32 tb_id, struct nl_info *info,
 307               unsigned int nlm_flags)
 308{
 309        struct sk_buff *skb;
 310        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 311        int err = -ENOBUFS;
 312
 313        skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
 314        if (skb == NULL)
 315                goto errout;
 316
 317        err = fib_dump_info(skb, info->pid, seq, event, tb_id,
 318                            fa->fa_type, fa->fa_scope, key, dst_len,
 319                            fa->fa_tos, fa->fa_info, nlm_flags);
 320        if (err < 0) {
 321                /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
 322                WARN_ON(err == -EMSGSIZE);
 323                kfree_skb(skb);
 324                goto errout;
 325        }
 326        rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
 327                    info->nlh, GFP_KERNEL);
 328        return;
 329errout:
 330        if (err < 0)
 331                rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
 332}
 333
 334/* Return the first fib alias matching TOS with
 335 * priority less than or equal to PRIO.
 336 */
 337struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
 338{
 339        if (fah) {
 340                struct fib_alias *fa;
 341                list_for_each_entry(fa, fah, fa_list) {
 342                        if (fa->fa_tos > tos)
 343                                continue;
 344                        if (fa->fa_info->fib_priority >= prio ||
 345                            fa->fa_tos < tos)
 346                                return fa;
 347                }
 348        }
 349        return NULL;
 350}
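
/*
 * Editor's note: a worked example with hypothetical values.  With an alias
 * list kept ordered by descending fa_tos, say (tos=8, prio=0) followed by
 * (tos=0, prio=100), a call with tos=0 and prio=50 skips the first entry
 * (its TOS is higher than requested) and returns the second, because its
 * priority 100 >= 50.  The FIB table code uses the returned alias as the
 * insertion point for a new alias with the given TOS and priority.
 */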
 351
 352int fib_detect_death(struct fib_info *fi, int order,
 353                     struct fib_info **last_resort, int *last_idx, int dflt)
 354{
 355        struct neighbour *n;
 356        int state = NUD_NONE;
 357
 358        n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
 359        if (n) {
 360                state = n->nud_state;
 361                neigh_release(n);
 362        }
 363        if (state == NUD_REACHABLE)
 364                return 0;
 365        if ((state&NUD_VALID) && order != dflt)
 366                return 0;
 367        if ((state&NUD_VALID) ||
 368            (*last_idx<0 && order > dflt)) {
 369                *last_resort = fi;
 370                *last_idx = order;
 371        }
 372        return 1;
 373}
 374
 375#ifdef CONFIG_IP_ROUTE_MULTIPATH
 376
 377static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
 378{
 379        int nhs = 0;
 380
 381        while (rtnh_ok(rtnh, remaining)) {
 382                nhs++;
 383                rtnh = rtnh_next(rtnh, &remaining);
 384        }
 385
 386        /* leftover implies invalid nexthop configuration, discard it */
 387        return remaining > 0 ? 0 : nhs;
 388}
 389
 390static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 391                       int remaining, struct fib_config *cfg)
 392{
 393        change_nexthops(fi) {
 394                int attrlen;
 395
 396                if (!rtnh_ok(rtnh, remaining))
 397                        return -EINVAL;
 398
 399                nexthop_nh->nh_flags =
 400                        (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
 401                nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
 402                nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
 403
 404                attrlen = rtnh_attrlen(rtnh);
 405                if (attrlen > 0) {
 406                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 407
 408                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
 409                        nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
 410#ifdef CONFIG_NET_CLS_ROUTE
 411                        nla = nla_find(attrs, attrlen, RTA_FLOW);
 412                        nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
 413#endif
 414                }
 415
 416                rtnh = rtnh_next(rtnh, &remaining);
 417        } endfor_nexthops(fi);
 418
 419        return 0;
 420}
 421
 422#endif
 423
 424int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 425{
 426#ifdef CONFIG_IP_ROUTE_MULTIPATH
 427        struct rtnexthop *rtnh;
 428        int remaining;
 429#endif
 430
 431        if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
 432                return 1;
 433
 434        if (cfg->fc_oif || cfg->fc_gw) {
 435                if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
 436                    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
 437                        return 0;
 438                return 1;
 439        }
 440
 441#ifdef CONFIG_IP_ROUTE_MULTIPATH
 442        if (cfg->fc_mp == NULL)
 443                return 0;
 444
 445        rtnh = cfg->fc_mp;
 446        remaining = cfg->fc_mp_len;
 447
 448        for_nexthops(fi) {
 449                int attrlen;
 450
 451                if (!rtnh_ok(rtnh, remaining))
 452                        return -EINVAL;
 453
 454                if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
 455                        return 1;
 456
 457                attrlen = rtnh_attrlen(rtnh);
  458                if (attrlen > 0) {
 459                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 460
 461                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
 462                        if (nla && nla_get_be32(nla) != nh->nh_gw)
 463                                return 1;
 464#ifdef CONFIG_NET_CLS_ROUTE
 465                        nla = nla_find(attrs, attrlen, RTA_FLOW);
 466                        if (nla && nla_get_u32(nla) != nh->nh_tclassid)
 467                                return 1;
 468#endif
 469                }
 470
 471                rtnh = rtnh_next(rtnh, &remaining);
 472        } endfor_nexthops(fi);
 473#endif
 474        return 0;
 475}
 476
 477
 478/*
 479   Picture
 480   -------
 481
  482   The semantics of nexthops are messy for historical reasons.
  483   We have to take into account that:
  484   a) a gateway can actually be a local interface address,
  485      so that a gatewayed route is direct.
  486   b) the gateway must be an on-link address, possibly
  487      described not by an ifaddr but by a direct route.
  488   c) if both gateway and interface are specified, they must not
  489      contradict each other.
  490   d) with tunnel routes, the gateway may not be on-link.
  491
  492   Attempting to reconcile all of these (alas, self-contradictory)
  493   conditions results in pretty ugly and hairy code with obscure logic.
  494
  495   I chose to generalize it instead, so that the size
  496   of the code barely grows, but it becomes
  497   much more general.
  498   Every prefix is assigned a "scope" value: "host" is a local address,
  499   "link" is a direct route,
  500   [ ... "site" ... "interior" ... ]
  501   and "universe" is a true gateway route with global meaning.
  502
  503   Every prefix refers to a set of "nexthop"s (gw, oif),
  504   where gw must have narrower scope. This recursion stops
  505   when gw has LOCAL scope or when the "nexthop" is declared ONLINK,
  506   which forces gw to be on-link.
  507
  508   The code is still hairy, but it is now logically consistent
  509   and very flexible. For example, as a by-product it allows
  510   independent exterior and interior routing processes to
  511   coexist peacefully.
  512
  513   Normally it looks like this:
 514
 515   {universe prefix}  -> (gw, oif) [scope link]
 516                          |
 517                          |-> {link prefix} -> (gw, oif) [scope local]
 518                                                |
 519                                                |-> {local prefix} (terminal node)
 520 */
 521
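/*
 * Editor's note: a concrete illustration of the scope rules above, using a
 * hypothetical setup rather than anything taken from this file.  Suppose
 * "192.168.0.0/24 dev eth0 scope link" is already installed.  Adding
 * "10.1.0.0/16 via 192.168.0.1" makes fib_check_nh() below resolve the
 * gateway 192.168.0.1 with a lookup restricted to scope >= RT_SCOPE_LINK;
 * that lookup hits the link-scope prefix, so the nexthop ends up with
 * nh_scope = RT_SCOPE_LINK and nh_dev = eth0.  Had the route been added
 * with the RTNH_F_ONLINK flag instead, the gateway would be accepted
 * without this recursive lookup, provided it is a unicast address and the
 * given output device exists and is up.
 */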
 522static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 523                        struct fib_nh *nh)
 524{
 525        int err;
 526        struct net *net;
 527
 528        net = cfg->fc_nlinfo.nl_net;
 529        if (nh->nh_gw) {
 530                struct fib_result res;
 531
 532                if (nh->nh_flags&RTNH_F_ONLINK) {
 533                        struct net_device *dev;
 534
 535                        if (cfg->fc_scope >= RT_SCOPE_LINK)
 536                                return -EINVAL;
 537                        if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
 538                                return -EINVAL;
 539                        if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
 540                                return -ENODEV;
 541                        if (!(dev->flags&IFF_UP))
 542                                return -ENETDOWN;
 543                        nh->nh_dev = dev;
 544                        dev_hold(dev);
 545                        nh->nh_scope = RT_SCOPE_LINK;
 546                        return 0;
 547                }
 548                {
 549                        struct flowi fl = {
 550                                .nl_u = {
 551                                        .ip4_u = {
 552                                                .daddr = nh->nh_gw,
 553                                                .scope = cfg->fc_scope + 1,
 554                                        },
 555                                },
 556                                .oif = nh->nh_oif,
 557                        };
 558
  559                        /* Not strictly necessary, but it takes some thought to see why */
 560                        if (fl.fl4_scope < RT_SCOPE_LINK)
 561                                fl.fl4_scope = RT_SCOPE_LINK;
 562                        if ((err = fib_lookup(net, &fl, &res)) != 0)
 563                                return err;
 564                }
 565                err = -EINVAL;
 566                if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
 567                        goto out;
 568                nh->nh_scope = res.scope;
 569                nh->nh_oif = FIB_RES_OIF(res);
 570                if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
 571                        goto out;
 572                dev_hold(nh->nh_dev);
 573                err = -ENETDOWN;
 574                if (!(nh->nh_dev->flags & IFF_UP))
 575                        goto out;
 576                err = 0;
 577out:
 578                fib_res_put(&res);
 579                return err;
 580        } else {
 581                struct in_device *in_dev;
 582
 583                if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
 584                        return -EINVAL;
 585
 586                in_dev = inetdev_by_index(net, nh->nh_oif);
 587                if (in_dev == NULL)
 588                        return -ENODEV;
 589                if (!(in_dev->dev->flags&IFF_UP)) {
 590                        in_dev_put(in_dev);
 591                        return -ENETDOWN;
 592                }
 593                nh->nh_dev = in_dev->dev;
 594                dev_hold(nh->nh_dev);
 595                nh->nh_scope = RT_SCOPE_HOST;
 596                in_dev_put(in_dev);
 597        }
 598        return 0;
 599}
 600
 601static inline unsigned int fib_laddr_hashfn(__be32 val)
 602{
 603        unsigned int mask = (fib_hash_size - 1);
 604
 605        return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
 606}
 607
 608static struct hlist_head *fib_hash_alloc(int bytes)
 609{
 610        if (bytes <= PAGE_SIZE)
 611                return kzalloc(bytes, GFP_KERNEL);
 612        else
 613                return (struct hlist_head *)
 614                        __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
 615}
 616
 617static void fib_hash_free(struct hlist_head *hash, int bytes)
 618{
 619        if (!hash)
 620                return;
 621
 622        if (bytes <= PAGE_SIZE)
 623                kfree(hash);
 624        else
 625                free_pages((unsigned long) hash, get_order(bytes));
 626}
 627
 628static void fib_hash_move(struct hlist_head *new_info_hash,
 629                          struct hlist_head *new_laddrhash,
 630                          unsigned int new_size)
 631{
 632        struct hlist_head *old_info_hash, *old_laddrhash;
 633        unsigned int old_size = fib_hash_size;
 634        unsigned int i, bytes;
 635
 636        spin_lock_bh(&fib_info_lock);
 637        old_info_hash = fib_info_hash;
 638        old_laddrhash = fib_info_laddrhash;
 639        fib_hash_size = new_size;
 640
 641        for (i = 0; i < old_size; i++) {
 642                struct hlist_head *head = &fib_info_hash[i];
 643                struct hlist_node *node, *n;
 644                struct fib_info *fi;
 645
 646                hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
 647                        struct hlist_head *dest;
 648                        unsigned int new_hash;
 649
 650                        hlist_del(&fi->fib_hash);
 651
 652                        new_hash = fib_info_hashfn(fi);
 653                        dest = &new_info_hash[new_hash];
 654                        hlist_add_head(&fi->fib_hash, dest);
 655                }
 656        }
 657        fib_info_hash = new_info_hash;
 658
 659        for (i = 0; i < old_size; i++) {
 660                struct hlist_head *lhead = &fib_info_laddrhash[i];
 661                struct hlist_node *node, *n;
 662                struct fib_info *fi;
 663
 664                hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
 665                        struct hlist_head *ldest;
 666                        unsigned int new_hash;
 667
 668                        hlist_del(&fi->fib_lhash);
 669
 670                        new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
 671                        ldest = &new_laddrhash[new_hash];
 672                        hlist_add_head(&fi->fib_lhash, ldest);
 673                }
 674        }
 675        fib_info_laddrhash = new_laddrhash;
 676
 677        spin_unlock_bh(&fib_info_lock);
 678
 679        bytes = old_size * sizeof(struct hlist_head *);
 680        fib_hash_free(old_info_hash, bytes);
 681        fib_hash_free(old_laddrhash, bytes);
 682}
 683
 684struct fib_info *fib_create_info(struct fib_config *cfg)
 685{
 686        int err;
 687        struct fib_info *fi = NULL;
 688        struct fib_info *ofi;
 689        int nhs = 1;
 690        struct net *net = cfg->fc_nlinfo.nl_net;
 691
  692        /* Fast check to catch the weirdest cases */
 693        if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
 694                goto err_inval;
 695
 696#ifdef CONFIG_IP_ROUTE_MULTIPATH
 697        if (cfg->fc_mp) {
 698                nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
 699                if (nhs == 0)
 700                        goto err_inval;
 701        }
 702#endif
 703
 704        err = -ENOBUFS;
 705        if (fib_info_cnt >= fib_hash_size) {
 706                unsigned int new_size = fib_hash_size << 1;
 707                struct hlist_head *new_info_hash;
 708                struct hlist_head *new_laddrhash;
 709                unsigned int bytes;
 710
 711                if (!new_size)
 712                        new_size = 1;
 713                bytes = new_size * sizeof(struct hlist_head *);
 714                new_info_hash = fib_hash_alloc(bytes);
 715                new_laddrhash = fib_hash_alloc(bytes);
 716                if (!new_info_hash || !new_laddrhash) {
 717                        fib_hash_free(new_info_hash, bytes);
 718                        fib_hash_free(new_laddrhash, bytes);
 719                } else
 720                        fib_hash_move(new_info_hash, new_laddrhash, new_size);
 721
 722                if (!fib_hash_size)
 723                        goto failure;
 724        }
 725
 726        fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
 727        if (fi == NULL)
 728                goto failure;
 729        fib_info_cnt++;
 730
 731        fi->fib_net = hold_net(net);
 732        fi->fib_protocol = cfg->fc_protocol;
 733        fi->fib_flags = cfg->fc_flags;
 734        fi->fib_priority = cfg->fc_priority;
 735        fi->fib_prefsrc = cfg->fc_prefsrc;
 736
 737        fi->fib_nhs = nhs;
 738        change_nexthops(fi) {
 739                nexthop_nh->nh_parent = fi;
 740        } endfor_nexthops(fi)
 741
 742        if (cfg->fc_mx) {
 743                struct nlattr *nla;
 744                int remaining;
 745
 746                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
 747                        int type = nla_type(nla);
 748
 749                        if (type) {
 750                                if (type > RTAX_MAX)
 751                                        goto err_inval;
 752                                fi->fib_metrics[type - 1] = nla_get_u32(nla);
 753                        }
 754                }
 755        }
 756
 757        if (cfg->fc_mp) {
 758#ifdef CONFIG_IP_ROUTE_MULTIPATH
 759                err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
 760                if (err != 0)
 761                        goto failure;
 762                if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
 763                        goto err_inval;
 764                if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
 765                        goto err_inval;
 766#ifdef CONFIG_NET_CLS_ROUTE
 767                if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
 768                        goto err_inval;
 769#endif
 770#else
 771                goto err_inval;
 772#endif
 773        } else {
 774                struct fib_nh *nh = fi->fib_nh;
 775
 776                nh->nh_oif = cfg->fc_oif;
 777                nh->nh_gw = cfg->fc_gw;
 778                nh->nh_flags = cfg->fc_flags;
 779#ifdef CONFIG_NET_CLS_ROUTE
 780                nh->nh_tclassid = cfg->fc_flow;
 781#endif
 782#ifdef CONFIG_IP_ROUTE_MULTIPATH
 783                nh->nh_weight = 1;
 784#endif
 785        }
 786
 787        if (fib_props[cfg->fc_type].error) {
 788                if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
 789                        goto err_inval;
 790                goto link_it;
 791        }
 792
 793        if (cfg->fc_scope > RT_SCOPE_HOST)
 794                goto err_inval;
 795
 796        if (cfg->fc_scope == RT_SCOPE_HOST) {
 797                struct fib_nh *nh = fi->fib_nh;
 798
 799                /* Local address is added. */
 800                if (nhs != 1 || nh->nh_gw)
 801                        goto err_inval;
 802                nh->nh_scope = RT_SCOPE_NOWHERE;
 803                nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
 804                err = -ENODEV;
 805                if (nh->nh_dev == NULL)
 806                        goto failure;
 807        } else {
 808                change_nexthops(fi) {
 809                        if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
 810                                goto failure;
 811                } endfor_nexthops(fi)
 812        }
 813
 814        if (fi->fib_prefsrc) {
 815                if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
 816                    fi->fib_prefsrc != cfg->fc_dst)
 817                        if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
 818                                goto err_inval;
 819        }
 820
 821link_it:
 822        if ((ofi = fib_find_info(fi)) != NULL) {
 823                fi->fib_dead = 1;
 824                free_fib_info(fi);
 825                ofi->fib_treeref++;
 826                return ofi;
 827        }
 828
 829        fi->fib_treeref++;
 830        atomic_inc(&fi->fib_clntref);
 831        spin_lock_bh(&fib_info_lock);
 832        hlist_add_head(&fi->fib_hash,
 833                       &fib_info_hash[fib_info_hashfn(fi)]);
 834        if (fi->fib_prefsrc) {
 835                struct hlist_head *head;
 836
 837                head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
 838                hlist_add_head(&fi->fib_lhash, head);
 839        }
 840        change_nexthops(fi) {
 841                struct hlist_head *head;
 842                unsigned int hash;
 843
 844                if (!nexthop_nh->nh_dev)
 845                        continue;
 846                hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
 847                head = &fib_info_devhash[hash];
 848                hlist_add_head(&nexthop_nh->nh_hash, head);
 849        } endfor_nexthops(fi)
 850        spin_unlock_bh(&fib_info_lock);
 851        return fi;
 852
 853err_inval:
 854        err = -EINVAL;
 855
 856failure:
 857        if (fi) {
 858                fi->fib_dead = 1;
 859                free_fib_info(fi);
 860        }
 861
 862        return ERR_PTR(err);
 863}
 864
  865/* Note! fib_semantic_match intentionally uses RCU list functions. */
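/*
 * Editor's note on the return convention, as implemented below: 0 means a
 * matching route was found and *res has been filled in (taking a reference
 * on the fib_info), 1 means no alias in the list matched, and a negative
 * value is the error associated with the matching route type, e.g.
 * -EHOSTUNREACH for an unreachable route.
 */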
 866int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 867                       struct fib_result *res, int prefixlen)
 868{
 869        struct fib_alias *fa;
 870        int nh_sel = 0;
 871
 872        list_for_each_entry_rcu(fa, head, fa_list) {
 873                int err;
 874
 875                if (fa->fa_tos &&
 876                    fa->fa_tos != flp->fl4_tos)
 877                        continue;
 878
 879                if (fa->fa_scope < flp->fl4_scope)
 880                        continue;
 881
 882                fa->fa_state |= FA_S_ACCESSED;
 883
 884                err = fib_props[fa->fa_type].error;
 885                if (err == 0) {
 886                        struct fib_info *fi = fa->fa_info;
 887
 888                        if (fi->fib_flags & RTNH_F_DEAD)
 889                                continue;
 890
 891                        switch (fa->fa_type) {
 892                        case RTN_UNICAST:
 893                        case RTN_LOCAL:
 894                        case RTN_BROADCAST:
 895                        case RTN_ANYCAST:
 896                        case RTN_MULTICAST:
 897                                for_nexthops(fi) {
 898                                        if (nh->nh_flags&RTNH_F_DEAD)
 899                                                continue;
 900                                        if (!flp->oif || flp->oif == nh->nh_oif)
 901                                                break;
 902                                }
 903#ifdef CONFIG_IP_ROUTE_MULTIPATH
 904                                if (nhsel < fi->fib_nhs) {
 905                                        nh_sel = nhsel;
 906                                        goto out_fill_res;
 907                                }
 908#else
 909                                if (nhsel < 1) {
 910                                        goto out_fill_res;
 911                                }
 912#endif
 913                                endfor_nexthops(fi);
 914                                continue;
 915
 916                        default:
 917                                printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
 918                                        fa->fa_type);
 919                                return -EINVAL;
 920                        }
 921                }
 922                return err;
 923        }
 924        return 1;
 925
 926out_fill_res:
 927        res->prefixlen = prefixlen;
 928        res->nh_sel = nh_sel;
 929        res->type = fa->fa_type;
 930        res->scope = fa->fa_scope;
 931        res->fi = fa->fa_info;
 932        atomic_inc(&res->fi->fib_clntref);
 933        return 0;
 934}
 935
  936/* Find an appropriate source address for this destination */
 937
 938__be32 __fib_res_prefsrc(struct fib_result *res)
 939{
 940        return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
 941}
 942
 943int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 944                  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
 945                  struct fib_info *fi, unsigned int flags)
 946{
 947        struct nlmsghdr *nlh;
 948        struct rtmsg *rtm;
 949
 950        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
 951        if (nlh == NULL)
 952                return -EMSGSIZE;
 953
 954        rtm = nlmsg_data(nlh);
 955        rtm->rtm_family = AF_INET;
 956        rtm->rtm_dst_len = dst_len;
 957        rtm->rtm_src_len = 0;
 958        rtm->rtm_tos = tos;
 959        if (tb_id < 256)
 960                rtm->rtm_table = tb_id;
 961        else
 962                rtm->rtm_table = RT_TABLE_COMPAT;
 963        NLA_PUT_U32(skb, RTA_TABLE, tb_id);
 964        rtm->rtm_type = type;
 965        rtm->rtm_flags = fi->fib_flags;
 966        rtm->rtm_scope = scope;
 967        rtm->rtm_protocol = fi->fib_protocol;
 968
 969        if (rtm->rtm_dst_len)
 970                NLA_PUT_BE32(skb, RTA_DST, dst);
 971
 972        if (fi->fib_priority)
 973                NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
 974
 975        if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
 976                goto nla_put_failure;
 977
 978        if (fi->fib_prefsrc)
 979                NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
 980
 981        if (fi->fib_nhs == 1) {
 982                if (fi->fib_nh->nh_gw)
 983                        NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
 984
 985                if (fi->fib_nh->nh_oif)
 986                        NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
 987#ifdef CONFIG_NET_CLS_ROUTE
 988                if (fi->fib_nh[0].nh_tclassid)
 989                        NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
 990#endif
 991        }
 992#ifdef CONFIG_IP_ROUTE_MULTIPATH
 993        if (fi->fib_nhs > 1) {
 994                struct rtnexthop *rtnh;
 995                struct nlattr *mp;
 996
 997                mp = nla_nest_start(skb, RTA_MULTIPATH);
 998                if (mp == NULL)
 999                        goto nla_put_failure;
1000
1001                for_nexthops(fi) {
1002                        rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1003                        if (rtnh == NULL)
1004                                goto nla_put_failure;
1005
1006                        rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1007                        rtnh->rtnh_hops = nh->nh_weight - 1;
1008                        rtnh->rtnh_ifindex = nh->nh_oif;
1009
1010                        if (nh->nh_gw)
1011                                NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1012#ifdef CONFIG_NET_CLS_ROUTE
1013                        if (nh->nh_tclassid)
1014                                NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1015#endif
1016                        /* length of rtnetlink header + attributes */
1017                        rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1018                } endfor_nexthops(fi);
1019
1020                nla_nest_end(skb, mp);
1021        }
1022#endif
1023        return nlmsg_end(skb, nlh);
1024
1025nla_put_failure:
1026        nlmsg_cancel(skb, nlh);
1027        return -EMSGSIZE;
1028}
1029
1030/*
1031   Update FIB if:
1032   - local address disappeared -> we must delete all the entries
1033     referring to it.
 1034   - a device went down -> we must shut down all nexthops going through it.
1035 */
1036int fib_sync_down_addr(struct net *net, __be32 local)
1037{
1038        int ret = 0;
1039        unsigned int hash = fib_laddr_hashfn(local);
1040        struct hlist_head *head = &fib_info_laddrhash[hash];
1041        struct hlist_node *node;
1042        struct fib_info *fi;
1043
1044        if (fib_info_laddrhash == NULL || local == 0)
1045                return 0;
1046
1047        hlist_for_each_entry(fi, node, head, fib_lhash) {
1048                if (!net_eq(fi->fib_net, net))
1049                        continue;
1050                if (fi->fib_prefsrc == local) {
1051                        fi->fib_flags |= RTNH_F_DEAD;
1052                        ret++;
1053                }
1054        }
1055        return ret;
1056}
1057
1058int fib_sync_down_dev(struct net_device *dev, int force)
1059{
1060        int ret = 0;
1061        int scope = RT_SCOPE_NOWHERE;
1062        struct fib_info *prev_fi = NULL;
1063        unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1064        struct hlist_head *head = &fib_info_devhash[hash];
1065        struct hlist_node *node;
1066        struct fib_nh *nh;
1067
1068        if (force)
1069                scope = -1;
1070
1071        hlist_for_each_entry(nh, node, head, nh_hash) {
1072                struct fib_info *fi = nh->nh_parent;
1073                int dead;
1074
1075                BUG_ON(!fi->fib_nhs);
1076                if (nh->nh_dev != dev || fi == prev_fi)
1077                        continue;
1078                prev_fi = fi;
1079                dead = 0;
1080                change_nexthops(fi) {
1081                        if (nexthop_nh->nh_flags&RTNH_F_DEAD)
1082                                dead++;
1083                        else if (nexthop_nh->nh_dev == dev &&
1084                                 nexthop_nh->nh_scope != scope) {
1085                                nexthop_nh->nh_flags |= RTNH_F_DEAD;
1086#ifdef CONFIG_IP_ROUTE_MULTIPATH
1087                                spin_lock_bh(&fib_multipath_lock);
1088                                fi->fib_power -= nexthop_nh->nh_power;
1089                                nexthop_nh->nh_power = 0;
1090                                spin_unlock_bh(&fib_multipath_lock);
1091#endif
1092                                dead++;
1093                        }
1094#ifdef CONFIG_IP_ROUTE_MULTIPATH
1095                        if (force > 1 && nexthop_nh->nh_dev == dev) {
1096                                dead = fi->fib_nhs;
1097                                break;
1098                        }
1099#endif
1100                } endfor_nexthops(fi)
1101                if (dead == fi->fib_nhs) {
1102                        fi->fib_flags |= RTNH_F_DEAD;
1103                        ret++;
1104                }
1105        }
1106
1107        return ret;
1108}
1109
1110#ifdef CONFIG_IP_ROUTE_MULTIPATH
1111
1112/*
 1113   A dead device comes back up. We wake up its dead nexthops.
 1114   This only makes sense for multipath routes.
1115 */
1116
1117int fib_sync_up(struct net_device *dev)
1118{
1119        struct fib_info *prev_fi;
1120        unsigned int hash;
1121        struct hlist_head *head;
1122        struct hlist_node *node;
1123        struct fib_nh *nh;
1124        int ret;
1125
1126        if (!(dev->flags&IFF_UP))
1127                return 0;
1128
1129        prev_fi = NULL;
1130        hash = fib_devindex_hashfn(dev->ifindex);
1131        head = &fib_info_devhash[hash];
1132        ret = 0;
1133
1134        hlist_for_each_entry(nh, node, head, nh_hash) {
1135                struct fib_info *fi = nh->nh_parent;
1136                int alive;
1137
1138                BUG_ON(!fi->fib_nhs);
1139                if (nh->nh_dev != dev || fi == prev_fi)
1140                        continue;
1141
1142                prev_fi = fi;
1143                alive = 0;
1144                change_nexthops(fi) {
1145                        if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
1146                                alive++;
1147                                continue;
1148                        }
1149                        if (nexthop_nh->nh_dev == NULL ||
1150                            !(nexthop_nh->nh_dev->flags&IFF_UP))
1151                                continue;
1152                        if (nexthop_nh->nh_dev != dev ||
1153                            !__in_dev_get_rtnl(dev))
1154                                continue;
1155                        alive++;
1156                        spin_lock_bh(&fib_multipath_lock);
1157                        nexthop_nh->nh_power = 0;
1158                        nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
1159                        spin_unlock_bh(&fib_multipath_lock);
1160                } endfor_nexthops(fi)
1161
1162                if (alive > 0) {
1163                        fi->fib_flags &= ~RTNH_F_DEAD;
1164                        ret++;
1165                }
1166        }
1167
1168        return ret;
1169}
1170
1171/*
 1172   The algorithm is suboptimal, but it provides a genuinely
 1173   fair weighted distribution of routes.
1174 */
1175
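/*
 * Editor's note: a worked example with hypothetical weights.  With two live
 * nexthops of weight 3 and 1, the refill step below sets nh_power to 3 and
 * 1 and fib_power to 4.  Each call then picks the nexthop whose remaining
 * nh_power covers the pseudo-random w and decrements both that nh_power and
 * fib_power, so over one full drain of fib_power the first nexthop is
 * selected exactly 3 times and the second exactly once, i.e. a 3:1 split,
 * after which the powers are replenished.
 */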
1176void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1177{
1178        struct fib_info *fi = res->fi;
1179        int w;
1180
1181        spin_lock_bh(&fib_multipath_lock);
1182        if (fi->fib_power <= 0) {
1183                int power = 0;
1184                change_nexthops(fi) {
1185                        if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
1186                                power += nexthop_nh->nh_weight;
1187                                nexthop_nh->nh_power = nexthop_nh->nh_weight;
1188                        }
1189                } endfor_nexthops(fi);
1190                fi->fib_power = power;
1191                if (power <= 0) {
1192                        spin_unlock_bh(&fib_multipath_lock);
1193                        /* Race condition: route has just become dead. */
1194                        res->nh_sel = 0;
1195                        return;
1196                }
1197        }
1198
1199
 1200        /* w should be a random number in [0..fi->fib_power-1];
 1201           jiffies gives a pretty bad approximation of one.
1202         */
1203
1204        w = jiffies % fi->fib_power;
1205
1206        change_nexthops(fi) {
1207                if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
1208                    nexthop_nh->nh_power) {
1209                        if ((w -= nexthop_nh->nh_power) <= 0) {
1210                                nexthop_nh->nh_power--;
1211                                fi->fib_power--;
1212                                res->nh_sel = nhsel;
1213                                spin_unlock_bh(&fib_multipath_lock);
1214                                return;
1215                        }
1216                }
1217        } endfor_nexthops(fi);
1218
1219        /* Race condition: route has just become dead. */
1220        res->nh_sel = 0;
1221        spin_unlock_bh(&fib_multipath_lock);
1222}
1223#endif
1224