linux/net/ipv4/fib_trie.c
<<
>>
Prefs
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally descibed in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET         An implementation of the TCP/IP protocol suite for the LINUX
  30 *              operating system.  INET is implemented using the  BSD Socket
  31 *              interface as the means of communication with the user level.
  32 *
  33 *              IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *              This program is free software; you can redistribute it and/or
  39 *              modify it under the terms of the GNU General Public License
  40 *              as published by the Free Software Foundation; either version
  41 *              2 of the License, or (at your option) any later version.
  42 *
  43 * Substantial contributions to this work comes from:
  44 *
  45 *              David S. Miller, <davem@davemloft.net>
  46 *              Stephen Hemminger <shemminger@osdl.org>
  47 *              Paul E. McKenney <paulmck@us.ibm.com>
  48 *              Patrick McHardy <kaber@trash.net>
  49 */
  50
  51#define VERSION "0.408"
  52
  53#include <asm/uaccess.h>
  54#include <asm/system.h>
  55#include <linux/bitops.h>
  56#include <linux/types.h>
  57#include <linux/kernel.h>
  58#include <linux/mm.h>
  59#include <linux/string.h>
  60#include <linux/socket.h>
  61#include <linux/sockios.h>
  62#include <linux/errno.h>
  63#include <linux/in.h>
  64#include <linux/inet.h>
  65#include <linux/inetdevice.h>
  66#include <linux/netdevice.h>
  67#include <linux/if_arp.h>
  68#include <linux/proc_fs.h>
  69#include <linux/rcupdate.h>
  70#include <linux/skbuff.h>
  71#include <linux/netlink.h>
  72#include <linux/init.h>
  73#include <linux/list.h>
  74#include <net/net_namespace.h>
  75#include <net/ip.h>
  76#include <net/protocol.h>
  77#include <net/route.h>
  78#include <net/tcp.h>
  79#include <net/sock.h>
  80#include <net/ip_fib.h>
  81#include "fib_lookup.h"
  82
  83#define MAX_STAT_DEPTH 32
  84
  85#define KEYLENGTH (8*sizeof(t_key))
  86
  87typedef unsigned int t_key;
  88
  89#define T_TNODE 0
  90#define T_LEAF  1
  91#define NODE_TYPE_MASK  0x1UL
  92#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  93
  94#define IS_TNODE(n) (!(n->parent & T_LEAF))
  95#define IS_LEAF(n) (n->parent & T_LEAF)
  96
  97struct node {
  98        unsigned long parent;
  99        t_key key;
 100};
 101
 102struct leaf {
 103        unsigned long parent;
 104        t_key key;
 105        struct hlist_head list;
 106        struct rcu_head rcu;
 107};
 108
 109struct leaf_info {
 110        struct hlist_node hlist;
 111        struct rcu_head rcu;
 112        int plen;
 113        struct list_head falh;
 114};
 115
 116struct tnode {
 117        unsigned long parent;
 118        t_key key;
 119        unsigned char pos;              /* 2log(KEYLENGTH) bits needed */
 120        unsigned char bits;             /* 2log(KEYLENGTH) bits needed */
 121        unsigned int full_children;     /* KEYLENGTH bits needed */
 122        unsigned int empty_children;    /* KEYLENGTH bits needed */
 123        union {
 124                struct rcu_head rcu;
 125                struct work_struct work;
 126                struct tnode *tnode_free;
 127        };
 128        struct node *child[0];
 129};
 130
 131#ifdef CONFIG_IP_FIB_TRIE_STATS
 132struct trie_use_stats {
 133        unsigned int gets;
 134        unsigned int backtrack;
 135        unsigned int semantic_match_passed;
 136        unsigned int semantic_match_miss;
 137        unsigned int null_node_hit;
 138        unsigned int resize_node_skipped;
 139};
 140#endif
 141
 142struct trie_stat {
 143        unsigned int totdepth;
 144        unsigned int maxdepth;
 145        unsigned int tnodes;
 146        unsigned int leaves;
 147        unsigned int nullpointers;
 148        unsigned int prefixes;
 149        unsigned int nodesizes[MAX_STAT_DEPTH];
 150};
 151
 152struct trie {
 153        struct node *trie;
 154#ifdef CONFIG_IP_FIB_TRIE_STATS
 155        struct trie_use_stats stats;
 156#endif
 157};
 158
 159static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 160static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 161                                  int wasfull);
 162static struct node *resize(struct trie *t, struct tnode *tn);
 163static struct tnode *inflate(struct trie *t, struct tnode *tn);
 164static struct tnode *halve(struct trie *t, struct tnode *tn);
 165/* tnodes to free after resize(); protected by RTNL */
 166static struct tnode *tnode_free_head;
 167
 168static struct kmem_cache *fn_alias_kmem __read_mostly;
 169static struct kmem_cache *trie_leaf_kmem __read_mostly;
 170
 171static inline struct tnode *node_parent(struct node *node)
 172{
 173        return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
 174}
 175
 176static inline struct tnode *node_parent_rcu(struct node *node)
 177{
 178        struct tnode *ret = node_parent(node);
 179
 180        return rcu_dereference(ret);
 181}
 182
 183/* Same as rcu_assign_pointer
 184 * but that macro() assumes that value is a pointer.
 185 */
 186static inline void node_set_parent(struct node *node, struct tnode *ptr)
 187{
 188        smp_wmb();
 189        node->parent = (unsigned long)ptr | NODE_TYPE(node);
 190}
 191
 192static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
 193{
 194        BUG_ON(i >= 1U << tn->bits);
 195
 196        return tn->child[i];
 197}
 198
 199static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
 200{
 201        struct node *ret = tnode_get_child(tn, i);
 202
 203        return rcu_dereference(ret);
 204}
 205
 206static inline int tnode_child_length(const struct tnode *tn)
 207{
 208        return 1 << tn->bits;
 209}
 210
 211static inline t_key mask_pfx(t_key k, unsigned short l)
 212{
 213        return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 214}
 215
 216static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
 217{
 218        if (offset < KEYLENGTH)
 219                return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 220        else
 221                return 0;
 222}
 223
 224static inline int tkey_equals(t_key a, t_key b)
 225{
 226        return a == b;
 227}
 228
 229static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 230{
 231        if (bits == 0 || offset >= KEYLENGTH)
 232                return 1;
 233        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 234        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 235}
 236
 237static inline int tkey_mismatch(t_key a, int offset, t_key b)
 238{
 239        t_key diff = a ^ b;
 240        int i = offset;
 241
 242        if (!diff)
 243                return 0;
 244        while ((diff << i) >> (KEYLENGTH-1) == 0)
 245                i++;
 246        return i;
 247}
 248
 249/*
 250  To understand this stuff, an understanding of keys and all their bits is
 251  necessary. Every node in the trie has a key associated with it, but not
 252  all of the bits in that key are significant.
 253
 254  Consider a node 'n' and its parent 'tp'.
 255
 256  If n is a leaf, every bit in its key is significant. Its presence is
 257  necessitated by path compression, since during a tree traversal (when
 258  searching for a leaf - unless we are doing an insertion) we will completely
 259  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 260  a potentially successful search, that we have indeed been walking the
 261  correct key path.
 262
 263  Note that we can never "miss" the correct key in the tree if present by
 264  following the wrong path. Path compression ensures that segments of the key
 265  that are the same for all keys with a given prefix are skipped, but the
 266  skipped part *is* identical for each node in the subtrie below the skipped
 267  bit! trie_insert() in this implementation takes care of that - note the
 268  call to tkey_sub_equals() in trie_insert().
 269
 270  if n is an internal node - a 'tnode' here, the various parts of its key
 271  have many different meanings.
 272
 273  Example:
 274  _________________________________________________________________
 275  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 276  -----------------------------------------------------------------
 277    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 278
 279  _________________________________________________________________
 280  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 281  -----------------------------------------------------------------
 282   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 283
 284  tp->pos = 7
 285  tp->bits = 3
 286  n->pos = 15
 287  n->bits = 4
 288
 289  First, let's just ignore the bits that come before the parent tp, that is
 290  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 291  not use them for anything.
 292
 293  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 294  index into the parent's child array. That is, they will be used to find
 295  'n' among tp's children.
 296
 297  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 298  for the node n.
 299
 300  All the bits we have seen so far are significant to the node n. The rest
 301  of the bits are really not needed or indeed known in n->key.
 302
 303  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 304  n's child array, and will of course be different for each child.
 305
 306
 307  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 308  at this point.
 309
 310*/
 311
 312static inline void check_tnode(const struct tnode *tn)
 313{
 314        WARN_ON(tn && tn->pos+tn->bits > 32);
 315}
 316
 317static const int halve_threshold = 25;
 318static const int inflate_threshold = 50;
 319static const int halve_threshold_root = 15;
 320static const int inflate_threshold_root = 25;
 321
 322
 323static void __alias_free_mem(struct rcu_head *head)
 324{
 325        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 326        kmem_cache_free(fn_alias_kmem, fa);
 327}
 328
 329static inline void alias_free_mem_rcu(struct fib_alias *fa)
 330{
 331        call_rcu(&fa->rcu, __alias_free_mem);
 332}
 333
 334static void __leaf_free_rcu(struct rcu_head *head)
 335{
 336        struct leaf *l = container_of(head, struct leaf, rcu);
 337        kmem_cache_free(trie_leaf_kmem, l);
 338}
 339
 340static inline void free_leaf(struct leaf *l)
 341{
 342        call_rcu_bh(&l->rcu, __leaf_free_rcu);
 343}
 344
 345static void __leaf_info_free_rcu(struct rcu_head *head)
 346{
 347        kfree(container_of(head, struct leaf_info, rcu));
 348}
 349
 350static inline void free_leaf_info(struct leaf_info *leaf)
 351{
 352        call_rcu(&leaf->rcu, __leaf_info_free_rcu);
 353}
 354
 355static struct tnode *tnode_alloc(size_t size)
 356{
 357        if (size <= PAGE_SIZE)
 358                return kzalloc(size, GFP_KERNEL);
 359        else
 360                return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
 361}
 362
 363static void __tnode_vfree(struct work_struct *arg)
 364{
 365        struct tnode *tn = container_of(arg, struct tnode, work);
 366        vfree(tn);
 367}
 368
 369static void __tnode_free_rcu(struct rcu_head *head)
 370{
 371        struct tnode *tn = container_of(head, struct tnode, rcu);
 372        size_t size = sizeof(struct tnode) +
 373                      (sizeof(struct node *) << tn->bits);
 374
 375        if (size <= PAGE_SIZE)
 376                kfree(tn);
 377        else {
 378                INIT_WORK(&tn->work, __tnode_vfree);
 379                schedule_work(&tn->work);
 380        }
 381}
 382
 383static inline void tnode_free(struct tnode *tn)
 384{
 385        if (IS_LEAF(tn))
 386                free_leaf((struct leaf *) tn);
 387        else
 388                call_rcu(&tn->rcu, __tnode_free_rcu);
 389}
 390
 391static void tnode_free_safe(struct tnode *tn)
 392{
 393        BUG_ON(IS_LEAF(tn));
 394        tn->tnode_free = tnode_free_head;
 395        tnode_free_head = tn;
 396}
 397
 398static void tnode_free_flush(void)
 399{
 400        struct tnode *tn;
 401
 402        while ((tn = tnode_free_head)) {
 403                tnode_free_head = tn->tnode_free;
 404                tn->tnode_free = NULL;
 405                tnode_free(tn);
 406        }
 407}
 408
 409static struct leaf *leaf_new(void)
 410{
 411        struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 412        if (l) {
 413                l->parent = T_LEAF;
 414                INIT_HLIST_HEAD(&l->list);
 415        }
 416        return l;
 417}
 418
 419static struct leaf_info *leaf_info_new(int plen)
 420{
 421        struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 422        if (li) {
 423                li->plen = plen;
 424                INIT_LIST_HEAD(&li->falh);
 425        }
 426        return li;
 427}
 428
 429static struct tnode *tnode_new(t_key key, int pos, int bits)
 430{
 431        size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
 432        struct tnode *tn = tnode_alloc(sz);
 433
 434        if (tn) {
 435                tn->parent = T_TNODE;
 436                tn->pos = pos;
 437                tn->bits = bits;
 438                tn->key = key;
 439                tn->full_children = 0;
 440                tn->empty_children = 1<<bits;
 441        }
 442
 443        pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
 444                 (unsigned long) (sizeof(struct node) << bits));
 445        return tn;
 446}
 447
 448/*
 449 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 450 * and no bits are skipped. See discussion in dyntree paper p. 6
 451 */
 452
 453static inline int tnode_full(const struct tnode *tn, const struct node *n)
 454{
 455        if (n == NULL || IS_LEAF(n))
 456                return 0;
 457
 458        return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 459}
 460
 461static inline void put_child(struct trie *t, struct tnode *tn, int i,
 462                             struct node *n)
 463{
 464        tnode_put_child_reorg(tn, i, n, -1);
 465}
 466
 467 /*
 468  * Add a child at position i overwriting the old value.
 469  * Update the value of full_children and empty_children.
 470  */
 471
 472static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 473                                  int wasfull)
 474{
 475        struct node *chi = tn->child[i];
 476        int isfull;
 477
 478        BUG_ON(i >= 1<<tn->bits);
 479
 480        /* update emptyChildren */
 481        if (n == NULL && chi != NULL)
 482                tn->empty_children++;
 483        else if (n != NULL && chi == NULL)
 484                tn->empty_children--;
 485
 486        /* update fullChildren */
 487        if (wasfull == -1)
 488                wasfull = tnode_full(tn, chi);
 489
 490        isfull = tnode_full(tn, n);
 491        if (wasfull && !isfull)
 492                tn->full_children--;
 493        else if (!wasfull && isfull)
 494                tn->full_children++;
 495
 496        if (n)
 497                node_set_parent(n, tn);
 498
 499        rcu_assign_pointer(tn->child[i], n);
 500}
 501
 502static struct node *resize(struct trie *t, struct tnode *tn)
 503{
 504        int i;
 505        int err = 0;
 506        struct tnode *old_tn;
 507        int inflate_threshold_use;
 508        int halve_threshold_use;
 509        int max_resize;
 510
 511        if (!tn)
 512                return NULL;
 513
 514        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 515                 tn, inflate_threshold, halve_threshold);
 516
 517        /* No children */
 518        if (tn->empty_children == tnode_child_length(tn)) {
 519                tnode_free_safe(tn);
 520                return NULL;
 521        }
 522        /* One child */
 523        if (tn->empty_children == tnode_child_length(tn) - 1)
 524                for (i = 0; i < tnode_child_length(tn); i++) {
 525                        struct node *n;
 526
 527                        n = tn->child[i];
 528                        if (!n)
 529                                continue;
 530
 531                        /* compress one level */
 532                        node_set_parent(n, NULL);
 533                        tnode_free_safe(tn);
 534                        return n;
 535                }
 536        /*
 537         * Double as long as the resulting node has a number of
 538         * nonempty nodes that are above the threshold.
 539         */
 540
 541        /*
 542         * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 543         * the Helsinki University of Technology and Matti Tikkanen of Nokia
 544         * Telecommunications, page 6:
 545         * "A node is doubled if the ratio of non-empty children to all
 546         * children in the *doubled* node is at least 'high'."
 547         *
 548         * 'high' in this instance is the variable 'inflate_threshold'. It
 549         * is expressed as a percentage, so we multiply it with
 550         * tnode_child_length() and instead of multiplying by 2 (since the
 551         * child array will be doubled by inflate()) and multiplying
 552         * the left-hand side by 100 (to handle the percentage thing) we
 553         * multiply the left-hand side by 50.
 554         *
 555         * The left-hand side may look a bit weird: tnode_child_length(tn)
 556         * - tn->empty_children is of course the number of non-null children
 557         * in the current node. tn->full_children is the number of "full"
 558         * children, that is non-null tnodes with a skip value of 0.
 559         * All of those will be doubled in the resulting inflated tnode, so
 560         * we just count them one extra time here.
 561         *
 562         * A clearer way to write this would be:
 563         *
 564         * to_be_doubled = tn->full_children;
 565         * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 566         *     tn->full_children;
 567         *
 568         * new_child_length = tnode_child_length(tn) * 2;
 569         *
 570         * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 571         *      new_child_length;
 572         * if (new_fill_factor >= inflate_threshold)
 573         *
 574         * ...and so on, tho it would mess up the while () loop.
 575         *
 576         * anyway,
 577         * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 578         *      inflate_threshold
 579         *
 580         * avoid a division:
 581         * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 582         *      inflate_threshold * new_child_length
 583         *
 584         * expand not_to_be_doubled and to_be_doubled, and shorten:
 585         * 100 * (tnode_child_length(tn) - tn->empty_children +
 586         *    tn->full_children) >= inflate_threshold * new_child_length
 587         *
 588         * expand new_child_length:
 589         * 100 * (tnode_child_length(tn) - tn->empty_children +
 590         *    tn->full_children) >=
 591         *      inflate_threshold * tnode_child_length(tn) * 2
 592         *
 593         * shorten again:
 594         * 50 * (tn->full_children + tnode_child_length(tn) -
 595         *    tn->empty_children) >= inflate_threshold *
 596         *    tnode_child_length(tn)
 597         *
 598         */
 599
 600        check_tnode(tn);
 601
 602        /* Keep root node larger  */
 603
 604        if (!tn->parent)
 605                inflate_threshold_use = inflate_threshold_root;
 606        else
 607                inflate_threshold_use = inflate_threshold;
 608
 609        err = 0;
 610        max_resize = 10;
 611        while ((tn->full_children > 0 &&  max_resize-- &&
 612                50 * (tn->full_children + tnode_child_length(tn)
 613                      - tn->empty_children)
 614                >= inflate_threshold_use * tnode_child_length(tn))) {
 615
 616                old_tn = tn;
 617                tn = inflate(t, tn);
 618
 619                if (IS_ERR(tn)) {
 620                        tn = old_tn;
 621#ifdef CONFIG_IP_FIB_TRIE_STATS
 622                        t->stats.resize_node_skipped++;
 623#endif
 624                        break;
 625                }
 626        }
 627
 628        if (max_resize < 0) {
 629                if (!tn->parent)
 630                        pr_warning("Fix inflate_threshold_root."
 631                                   " Now=%d size=%d bits\n",
 632                                   inflate_threshold_root, tn->bits);
 633                else
 634                        pr_warning("Fix inflate_threshold."
 635                                   " Now=%d size=%d bits\n",
 636                                   inflate_threshold, tn->bits);
 637        }
 638
 639        check_tnode(tn);
 640
 641        /*
 642         * Halve as long as the number of empty children in this
 643         * node is above threshold.
 644         */
 645
 646
 647        /* Keep root node larger  */
 648
 649        if (!tn->parent)
 650                halve_threshold_use = halve_threshold_root;
 651        else
 652                halve_threshold_use = halve_threshold;
 653
 654        err = 0;
 655        max_resize = 10;
 656        while (tn->bits > 1 &&  max_resize-- &&
 657               100 * (tnode_child_length(tn) - tn->empty_children) <
 658               halve_threshold_use * tnode_child_length(tn)) {
 659
 660                old_tn = tn;
 661                tn = halve(t, tn);
 662                if (IS_ERR(tn)) {
 663                        tn = old_tn;
 664#ifdef CONFIG_IP_FIB_TRIE_STATS
 665                        t->stats.resize_node_skipped++;
 666#endif
 667                        break;
 668                }
 669        }
 670
 671        if (max_resize < 0) {
 672                if (!tn->parent)
 673                        pr_warning("Fix halve_threshold_root."
 674                                   " Now=%d size=%d bits\n",
 675                                   halve_threshold_root, tn->bits);
 676                else
 677                        pr_warning("Fix halve_threshold."
 678                                   " Now=%d size=%d bits\n",
 679                                   halve_threshold, tn->bits);
 680        }
 681
 682        /* Only one child remains */
 683        if (tn->empty_children == tnode_child_length(tn) - 1)
 684                for (i = 0; i < tnode_child_length(tn); i++) {
 685                        struct node *n;
 686
 687                        n = tn->child[i];
 688                        if (!n)
 689                                continue;
 690
 691                        /* compress one level */
 692
 693                        node_set_parent(n, NULL);
 694                        tnode_free_safe(tn);
 695                        return n;
 696                }
 697
 698        return (struct node *) tn;
 699}
 700
 701static struct tnode *inflate(struct trie *t, struct tnode *tn)
 702{
 703        struct tnode *oldtnode = tn;
 704        int olen = tnode_child_length(tn);
 705        int i;
 706
 707        pr_debug("In inflate\n");
 708
 709        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 710
 711        if (!tn)
 712                return ERR_PTR(-ENOMEM);
 713
 714        /*
 715         * Preallocate and store tnodes before the actual work so we
 716         * don't get into an inconsistent state if memory allocation
 717         * fails. In case of failure we return the oldnode and  inflate
 718         * of tnode is ignored.
 719         */
 720
 721        for (i = 0; i < olen; i++) {
 722                struct tnode *inode;
 723
 724                inode = (struct tnode *) tnode_get_child(oldtnode, i);
 725                if (inode &&
 726                    IS_TNODE(inode) &&
 727                    inode->pos == oldtnode->pos + oldtnode->bits &&
 728                    inode->bits > 1) {
 729                        struct tnode *left, *right;
 730                        t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 731
 732                        left = tnode_new(inode->key&(~m), inode->pos + 1,
 733                                         inode->bits - 1);
 734                        if (!left)
 735                                goto nomem;
 736
 737                        right = tnode_new(inode->key|m, inode->pos + 1,
 738                                          inode->bits - 1);
 739
 740                        if (!right) {
 741                                tnode_free(left);
 742                                goto nomem;
 743                        }
 744
 745                        put_child(t, tn, 2*i, (struct node *) left);
 746                        put_child(t, tn, 2*i+1, (struct node *) right);
 747                }
 748        }
 749
 750        for (i = 0; i < olen; i++) {
 751                struct tnode *inode;
 752                struct node *node = tnode_get_child(oldtnode, i);
 753                struct tnode *left, *right;
 754                int size, j;
 755
 756                /* An empty child */
 757                if (node == NULL)
 758                        continue;
 759
 760                /* A leaf or an internal node with skipped bits */
 761
 762                if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 763                   tn->pos + tn->bits - 1) {
 764                        if (tkey_extract_bits(node->key,
 765                                              oldtnode->pos + oldtnode->bits,
 766                                              1) == 0)
 767                                put_child(t, tn, 2*i, node);
 768                        else
 769                                put_child(t, tn, 2*i+1, node);
 770                        continue;
 771                }
 772
 773                /* An internal node with two children */
 774                inode = (struct tnode *) node;
 775
 776                if (inode->bits == 1) {
 777                        put_child(t, tn, 2*i, inode->child[0]);
 778                        put_child(t, tn, 2*i+1, inode->child[1]);
 779
 780                        tnode_free_safe(inode);
 781                        continue;
 782                }
 783
 784                /* An internal node with more than two children */
 785
 786                /* We will replace this node 'inode' with two new
 787                 * ones, 'left' and 'right', each with half of the
 788                 * original children. The two new nodes will have
 789                 * a position one bit further down the key and this
 790                 * means that the "significant" part of their keys
 791                 * (see the discussion near the top of this file)
 792                 * will differ by one bit, which will be "0" in
 793                 * left's key and "1" in right's key. Since we are
 794                 * moving the key position by one step, the bit that
 795                 * we are moving away from - the bit at position
 796                 * (inode->pos) - is the one that will differ between
 797                 * left and right. So... we synthesize that bit in the
 798                 * two  new keys.
 799                 * The mask 'm' below will be a single "one" bit at
 800                 * the position (inode->pos)
 801                 */
 802
 803                /* Use the old key, but set the new significant
 804                 *   bit to zero.
 805                 */
 806
 807                left = (struct tnode *) tnode_get_child(tn, 2*i);
 808                put_child(t, tn, 2*i, NULL);
 809
 810                BUG_ON(!left);
 811
 812                right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 813                put_child(t, tn, 2*i+1, NULL);
 814
 815                BUG_ON(!right);
 816
 817                size = tnode_child_length(left);
 818                for (j = 0; j < size; j++) {
 819                        put_child(t, left, j, inode->child[j]);
 820                        put_child(t, right, j, inode->child[j + size]);
 821                }
 822                put_child(t, tn, 2*i, resize(t, left));
 823                put_child(t, tn, 2*i+1, resize(t, right));
 824
 825                tnode_free_safe(inode);
 826        }
 827        tnode_free_safe(oldtnode);
 828        return tn;
 829nomem:
 830        {
 831                int size = tnode_child_length(tn);
 832                int j;
 833
 834                for (j = 0; j < size; j++)
 835                        if (tn->child[j])
 836                                tnode_free((struct tnode *)tn->child[j]);
 837
 838                tnode_free(tn);
 839
 840                return ERR_PTR(-ENOMEM);
 841        }
 842}
 843
 844static struct tnode *halve(struct trie *t, struct tnode *tn)
 845{
 846        struct tnode *oldtnode = tn;
 847        struct node *left, *right;
 848        int i;
 849        int olen = tnode_child_length(tn);
 850
 851        pr_debug("In halve\n");
 852
 853        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 854
 855        if (!tn)
 856                return ERR_PTR(-ENOMEM);
 857
 858        /*
 859         * Preallocate and store tnodes before the actual work so we
 860         * don't get into an inconsistent state if memory allocation
 861         * fails. In case of failure we return the oldnode and halve
 862         * of tnode is ignored.
 863         */
 864
 865        for (i = 0; i < olen; i += 2) {
 866                left = tnode_get_child(oldtnode, i);
 867                right = tnode_get_child(oldtnode, i+1);
 868
 869                /* Two nonempty children */
 870                if (left && right) {
 871                        struct tnode *newn;
 872
 873                        newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 874
 875                        if (!newn)
 876                                goto nomem;
 877
 878                        put_child(t, tn, i/2, (struct node *)newn);
 879                }
 880
 881        }
 882
 883        for (i = 0; i < olen; i += 2) {
 884                struct tnode *newBinNode;
 885
 886                left = tnode_get_child(oldtnode, i);
 887                right = tnode_get_child(oldtnode, i+1);
 888
 889                /* At least one of the children is empty */
 890                if (left == NULL) {
 891                        if (right == NULL)    /* Both are empty */
 892                                continue;
 893                        put_child(t, tn, i/2, right);
 894                        continue;
 895                }
 896
 897                if (right == NULL) {
 898                        put_child(t, tn, i/2, left);
 899                        continue;
 900                }
 901
 902                /* Two nonempty children */
 903                newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 904                put_child(t, tn, i/2, NULL);
 905                put_child(t, newBinNode, 0, left);
 906                put_child(t, newBinNode, 1, right);
 907                put_child(t, tn, i/2, resize(t, newBinNode));
 908        }
 909        tnode_free_safe(oldtnode);
 910        return tn;
 911nomem:
 912        {
 913                int size = tnode_child_length(tn);
 914                int j;
 915
 916                for (j = 0; j < size; j++)
 917                        if (tn->child[j])
 918                                tnode_free((struct tnode *)tn->child[j]);
 919
 920                tnode_free(tn);
 921
 922                return ERR_PTR(-ENOMEM);
 923        }
 924}
 925
 926/* readside must use rcu_read_lock currently dump routines
 927 via get_fa_head and dump */
 928
 929static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 930{
 931        struct hlist_head *head = &l->list;
 932        struct hlist_node *node;
 933        struct leaf_info *li;
 934
 935        hlist_for_each_entry_rcu(li, node, head, hlist)
 936                if (li->plen == plen)
 937                        return li;
 938
 939        return NULL;
 940}
 941
 942static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 943{
 944        struct leaf_info *li = find_leaf_info(l, plen);
 945
 946        if (!li)
 947                return NULL;
 948
 949        return &li->falh;
 950}
 951
 952static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 953{
 954        struct leaf_info *li = NULL, *last = NULL;
 955        struct hlist_node *node;
 956
 957        if (hlist_empty(head)) {
 958                hlist_add_head_rcu(&new->hlist, head);
 959        } else {
 960                hlist_for_each_entry(li, node, head, hlist) {
 961                        if (new->plen > li->plen)
 962                                break;
 963
 964                        last = li;
 965                }
 966                if (last)
 967                        hlist_add_after_rcu(&last->hlist, &new->hlist);
 968                else
 969                        hlist_add_before_rcu(&new->hlist, &li->hlist);
 970        }
 971}
 972
 973/* rcu_read_lock needs to be hold by caller from readside */
 974
 975static struct leaf *
 976fib_find_node(struct trie *t, u32 key)
 977{
 978        int pos;
 979        struct tnode *tn;
 980        struct node *n;
 981
 982        pos = 0;
 983        n = rcu_dereference(t->trie);
 984
 985        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 986                tn = (struct tnode *) n;
 987
 988                check_tnode(tn);
 989
 990                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 991                        pos = tn->pos + tn->bits;
 992                        n = tnode_get_child_rcu(tn,
 993                                                tkey_extract_bits(key,
 994                                                                  tn->pos,
 995                                                                  tn->bits));
 996                } else
 997                        break;
 998        }
 999        /* Case we have found a leaf. Compare prefixes */
1000
1001        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
1002                return (struct leaf *)n;
1003
1004        return NULL;
1005}
1006
1007static void trie_rebalance(struct trie *t, struct tnode *tn)
1008{
1009        int wasfull;
1010        t_key cindex, key;
1011        struct tnode *tp;
1012
1013        key = tn->key;
1014
1015        while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1016                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1018                tn = (struct tnode *) resize(t, (struct tnode *)tn);
1019
1020                tnode_put_child_reorg((struct tnode *)tp, cindex,
1021                                      (struct node *)tn, wasfull);
1022
1023                tp = node_parent((struct node *) tn);
1024                if (!tp)
1025                        rcu_assign_pointer(t->trie, (struct node *)tn);
1026
1027                tnode_free_flush();
1028                if (!tp)
1029                        break;
1030                tn = tp;
1031        }
1032
1033        /* Handle last (top) tnode */
1034        if (IS_TNODE(tn))
1035                tn = (struct tnode *)resize(t, (struct tnode *)tn);
1036
1037        rcu_assign_pointer(t->trie, (struct node *)tn);
1038        tnode_free_flush();
1039
1040        return;
1041}
1042
1043/* only used from updater-side */
1044
1045static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1046{
1047        int pos, newpos;
1048        struct tnode *tp = NULL, *tn = NULL;
1049        struct node *n;
1050        struct leaf *l;
1051        int missbit;
1052        struct list_head *fa_head = NULL;
1053        struct leaf_info *li;
1054        t_key cindex;
1055
1056        pos = 0;
1057        n = t->trie;
1058
1059        /* If we point to NULL, stop. Either the tree is empty and we should
1060         * just put a new leaf in if, or we have reached an empty child slot,
1061         * and we should just put our new leaf in that.
1062         * If we point to a T_TNODE, check if it matches our key. Note that
1063         * a T_TNODE might be skipping any number of bits - its 'pos' need
1064         * not be the parent's 'pos'+'bits'!
1065         *
1066         * If it does match the current key, get pos/bits from it, extract
1067         * the index from our key, push the T_TNODE and walk the tree.
1068         *
1069         * If it doesn't, we have to replace it with a new T_TNODE.
1070         *
1071         * If we point to a T_LEAF, it might or might not have the same key
1072         * as we do. If it does, just change the value, update the T_LEAF's
1073         * value, and return it.
1074         * If it doesn't, we need to replace it with a T_TNODE.
1075         */
1076
1077        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1078                tn = (struct tnode *) n;
1079
1080                check_tnode(tn);
1081
1082                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1083                        tp = tn;
1084                        pos = tn->pos + tn->bits;
1085                        n = tnode_get_child(tn,
1086                                            tkey_extract_bits(key,
1087                                                              tn->pos,
1088                                                              tn->bits));
1089
1090                        BUG_ON(n && node_parent(n) != tn);
1091                } else
1092                        break;
1093        }
1094
1095        /*
1096         * n  ----> NULL, LEAF or TNODE
1097         *
1098         * tp is n's (parent) ----> NULL or TNODE
1099         */
1100
1101        BUG_ON(tp && IS_LEAF(tp));
1102
1103        /* Case 1: n is a leaf. Compare prefixes */
1104
1105        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1106                l = (struct leaf *) n;
1107                li = leaf_info_new(plen);
1108
1109                if (!li)
1110                        return NULL;
1111
1112                fa_head = &li->falh;
1113                insert_leaf_info(&l->list, li);
1114                goto done;
1115        }
1116        l = leaf_new();
1117
1118        if (!l)
1119                return NULL;
1120
1121        l->key = key;
1122        li = leaf_info_new(plen);
1123
1124        if (!li) {
1125                free_leaf(l);
1126                return NULL;
1127        }
1128
1129        fa_head = &li->falh;
1130        insert_leaf_info(&l->list, li);
1131
1132        if (t->trie && n == NULL) {
1133                /* Case 2: n is NULL, and will just insert a new leaf */
1134
1135                node_set_parent((struct node *)l, tp);
1136
1137                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1138                put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1139        } else {
1140                /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1141                /*
1142                 *  Add a new tnode here
1143                 *  first tnode need some special handling
1144                 */
1145
1146                if (tp)
1147                        pos = tp->pos+tp->bits;
1148                else
1149                        pos = 0;
1150
1151                if (n) {
1152                        newpos = tkey_mismatch(key, pos, n->key);
1153                        tn = tnode_new(n->key, newpos, 1);
1154                } else {
1155                        newpos = 0;
1156                        tn = tnode_new(key, newpos, 1); /* First tnode */
1157                }
1158
1159                if (!tn) {
1160                        free_leaf_info(li);
1161                        free_leaf(l);
1162                        return NULL;
1163                }
1164
1165                node_set_parent((struct node *)tn, tp);
1166
1167                missbit = tkey_extract_bits(key, newpos, 1);
1168                put_child(t, tn, missbit, (struct node *)l);
1169                put_child(t, tn, 1-missbit, n);
1170
1171                if (tp) {
1172                        cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1173                        put_child(t, (struct tnode *)tp, cindex,
1174                                  (struct node *)tn);
1175                } else {
1176                        rcu_assign_pointer(t->trie, (struct node *)tn);
1177                        tp = tn;
1178                }
1179        }
1180
1181        if (tp && tp->pos + tp->bits > 32)
1182                pr_warning("fib_trie"
1183                           " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1184                           tp, tp->pos, tp->bits, key, plen);
1185
1186        /* Rebalance the trie */
1187
1188        trie_rebalance(t, tp);
1189done:
1190        return fa_head;
1191}
1192
1193/*
1194 * Caller must hold RTNL.
1195 */
1196static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1197{
1198        struct trie *t = (struct trie *) tb->tb_data;
1199        struct fib_alias *fa, *new_fa;
1200        struct list_head *fa_head = NULL;
1201        struct fib_info *fi;
1202        int plen = cfg->fc_dst_len;
1203        u8 tos = cfg->fc_tos;
1204        u32 key, mask;
1205        int err;
1206        struct leaf *l;
1207
1208        if (plen > 32)
1209                return -EINVAL;
1210
1211        key = ntohl(cfg->fc_dst);
1212
1213        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1214
1215        mask = ntohl(inet_make_mask(plen));
1216
1217        if (key & ~mask)
1218                return -EINVAL;
1219
1220        key = key & mask;
1221
1222        fi = fib_create_info(cfg);
1223        if (IS_ERR(fi)) {
1224                err = PTR_ERR(fi);
1225                goto err;
1226        }
1227
1228        l = fib_find_node(t, key);
1229        fa = NULL;
1230
1231        if (l) {
1232                fa_head = get_fa_head(l, plen);
1233                fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1234        }
1235
1236        /* Now fa, if non-NULL, points to the first fib alias
1237         * with the same keys [prefix,tos,priority], if such key already
1238         * exists or to the node before which we will insert new one.
1239         *
1240         * If fa is NULL, we will need to allocate a new one and
1241         * insert to the head of f.
1242         *
1243         * If f is NULL, no fib node matched the destination key
1244         * and we need to allocate a new one of those as well.
1245         */
1246
1247        if (fa && fa->fa_tos == tos &&
1248            fa->fa_info->fib_priority == fi->fib_priority) {
1249                struct fib_alias *fa_first, *fa_match;
1250
1251                err = -EEXIST;
1252                if (cfg->fc_nlflags & NLM_F_EXCL)
1253                        goto out;
1254
1255                /* We have 2 goals:
1256                 * 1. Find exact match for type, scope, fib_info to avoid
1257                 * duplicate routes
1258                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1259                 */
1260                fa_match = NULL;
1261                fa_first = fa;
1262                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1263                list_for_each_entry_continue(fa, fa_head, fa_list) {
1264                        if (fa->fa_tos != tos)
1265                                break;
1266                        if (fa->fa_info->fib_priority != fi->fib_priority)
1267                                break;
1268                        if (fa->fa_type == cfg->fc_type &&
1269                            fa->fa_scope == cfg->fc_scope &&
1270                            fa->fa_info == fi) {
1271                                fa_match = fa;
1272                                break;
1273                        }
1274                }
1275
1276                if (cfg->fc_nlflags & NLM_F_REPLACE) {
1277                        struct fib_info *fi_drop;
1278                        u8 state;
1279
1280                        fa = fa_first;
1281                        if (fa_match) {
1282                                if (fa == fa_match)
1283                                        err = 0;
1284                                goto out;
1285                        }
1286                        err = -ENOBUFS;
1287                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1288                        if (new_fa == NULL)
1289                                goto out;
1290
1291                        fi_drop = fa->fa_info;
1292                        new_fa->fa_tos = fa->fa_tos;
1293                        new_fa->fa_info = fi;
1294                        new_fa->fa_type = cfg->fc_type;
1295                        new_fa->fa_scope = cfg->fc_scope;
1296                        state = fa->fa_state;
1297                        new_fa->fa_state = state & ~FA_S_ACCESSED;
1298
1299                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1300                        alias_free_mem_rcu(fa);
1301
1302                        fib_release_info(fi_drop);
1303                        if (state & FA_S_ACCESSED)
1304                                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1305                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1306                                tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1307
1308                        goto succeeded;
1309                }
1310                /* Error if we find a perfect match which
1311                 * uses the same scope, type, and nexthop
1312                 * information.
1313                 */
1314                if (fa_match)
1315                        goto out;
1316
1317                if (!(cfg->fc_nlflags & NLM_F_APPEND))
1318                        fa = fa_first;
1319        }
1320        err = -ENOENT;
1321        if (!(cfg->fc_nlflags & NLM_F_CREATE))
1322                goto out;
1323
1324        err = -ENOBUFS;
1325        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1326        if (new_fa == NULL)
1327                goto out;
1328
1329        new_fa->fa_info = fi;
1330        new_fa->fa_tos = tos;
1331        new_fa->fa_type = cfg->fc_type;
1332        new_fa->fa_scope = cfg->fc_scope;
1333        new_fa->fa_state = 0;
1334        /*
1335         * Insert new entry to the list.
1336         */
1337
1338        if (!fa_head) {
1339                fa_head = fib_insert_node(t, key, plen);
1340                if (unlikely(!fa_head)) {
1341                        err = -ENOMEM;
1342                        goto out_free_new_fa;
1343                }
1344        }
1345
1346        list_add_tail_rcu(&new_fa->fa_list,
1347                          (fa ? &fa->fa_list : fa_head));
1348
1349        rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1350        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1351                  &cfg->fc_nlinfo, 0);
1352succeeded:
1353        return 0;
1354
1355out_free_new_fa:
1356        kmem_cache_free(fn_alias_kmem, new_fa);
1357out:
1358        fib_release_info(fi);
1359err:
1360        return err;
1361}
1362
1363/* should be called with rcu_read_lock */
1364static int check_leaf(struct trie *t, struct leaf *l,
1365                      t_key key,  const struct flowi *flp,
1366                      struct fib_result *res)
1367{
1368        struct leaf_info *li;
1369        struct hlist_head *hhead = &l->list;
1370        struct hlist_node *node;
1371
1372        hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1373                int err;
1374                int plen = li->plen;
1375                __be32 mask = inet_make_mask(plen);
1376
1377                if (l->key != (key & ntohl(mask)))
1378                        continue;
1379
1380                err = fib_semantic_match(&li->falh, flp, res, plen);
1381
1382#ifdef CONFIG_IP_FIB_TRIE_STATS
1383                if (err <= 0)
1384                        t->stats.semantic_match_passed++;
1385                else
1386                        t->stats.semantic_match_miss++;
1387#endif
1388                if (err <= 0)
1389                        return err;
1390        }
1391
1392        return 1;
1393}
1394
1395static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1396                          struct fib_result *res)
1397{
1398        struct trie *t = (struct trie *) tb->tb_data;
1399        int ret;
1400        struct node *n;
1401        struct tnode *pn;
1402        int pos, bits;
1403        t_key key = ntohl(flp->fl4_dst);
1404        int chopped_off;
1405        t_key cindex = 0;
1406        int current_prefix_length = KEYLENGTH;
1407        struct tnode *cn;
1408        t_key node_prefix, key_prefix, pref_mismatch;
1409        int mp;
1410
1411        rcu_read_lock();
1412
1413        n = rcu_dereference(t->trie);
1414        if (!n)
1415                goto failed;
1416
1417#ifdef CONFIG_IP_FIB_TRIE_STATS
1418        t->stats.gets++;
1419#endif
1420
1421        /* Just a leaf? */
1422        if (IS_LEAF(n)) {
1423                ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1424                goto found;
1425        }
1426
1427        pn = (struct tnode *) n;
1428        chopped_off = 0;
1429
1430        while (pn) {
1431                pos = pn->pos;
1432                bits = pn->bits;
1433
1434                if (!chopped_off)
1435                        cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1436                                                   pos, bits);
1437
1438                n = tnode_get_child(pn, cindex);
1439
1440                if (n == NULL) {
1441#ifdef CONFIG_IP_FIB_TRIE_STATS
1442                        t->stats.null_node_hit++;
1443#endif
1444                        goto backtrace;
1445                }
1446
1447                if (IS_LEAF(n)) {
1448                        ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1449                        if (ret > 0)
1450                                goto backtrace;
1451                        goto found;
1452                }
1453
1454                cn = (struct tnode *)n;
1455
1456                /*
1457                 * It's a tnode, and we can do some extra checks here if we
1458                 * like, to avoid descending into a dead-end branch.
1459                 * This tnode is in the parent's child array at index
1460                 * key[p_pos..p_pos+p_bits] but potentially with some bits
1461                 * chopped off, so in reality the index may be just a
1462                 * subprefix, padded with zero at the end.
1463                 * We can also take a look at any skipped bits in this
1464                 * tnode - everything up to p_pos is supposed to be ok,
1465                 * and the non-chopped bits of the index (se previous
1466                 * paragraph) are also guaranteed ok, but the rest is
1467                 * considered unknown.
1468                 *
1469                 * The skipped bits are key[pos+bits..cn->pos].
1470                 */
1471
1472                /* If current_prefix_length < pos+bits, we are already doing
1473                 * actual prefix  matching, which means everything from
1474                 * pos+(bits-chopped_off) onward must be zero along some
1475                 * branch of this subtree - otherwise there is *no* valid
1476                 * prefix present. Here we can only check the skipped
1477                 * bits. Remember, since we have already indexed into the
1478                 * parent's child array, we know that the bits we chopped of
1479                 * *are* zero.
1480                 */
1481
1482                /* NOTA BENE: Checking only skipped bits
1483                   for the new node here */
1484
1485                if (current_prefix_length < pos+bits) {
1486                        if (tkey_extract_bits(cn->key, current_prefix_length,
1487                                                cn->pos - current_prefix_length)
1488                            || !(cn->child[0]))
1489                                goto backtrace;
1490                }
1491
1492                /*
1493                 * If chopped_off=0, the index is fully validated and we
1494                 * only need to look at the skipped bits for this, the new,
1495                 * tnode. What we actually want to do is to find out if
1496                 * these skipped bits match our key perfectly, or if we will
1497                 * have to count on finding a matching prefix further down,
1498                 * because if we do, we would like to have some way of
1499                 * verifying the existence of such a prefix at this point.
1500                 */
1501
1502                /* The only thing we can do at this point is to verify that
1503                 * any such matching prefix can indeed be a prefix to our
1504                 * key, and if the bits in the node we are inspecting that
1505                 * do not match our key are not ZERO, this cannot be true.
1506                 * Thus, find out where there is a mismatch (before cn->pos)
1507                 * and verify that all the mismatching bits are zero in the
1508                 * new tnode's key.
1509                 */
1510
1511                /*
1512                 * Note: We aren't very concerned about the piece of
1513                 * the key that precede pn->pos+pn->bits, since these
1514                 * have already been checked. The bits after cn->pos
1515                 * aren't checked since these are by definition
1516                 * "unknown" at this point. Thus, what we want to see
1517                 * is if we are about to enter the "prefix matching"
1518                 * state, and in that case verify that the skipped
1519                 * bits that will prevail throughout this subtree are
1520                 * zero, as they have to be if we are to find a
1521                 * matching prefix.
1522                 */
1523
1524                node_prefix = mask_pfx(cn->key, cn->pos);
1525                key_prefix = mask_pfx(key, cn->pos);
1526                pref_mismatch = key_prefix^node_prefix;
1527                mp = 0;
1528
1529                /*
1530                 * In short: If skipped bits in this node do not match
1531                 * the search key, enter the "prefix matching"
1532                 * state.directly.
1533                 */
1534                if (pref_mismatch) {
1535                        while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1536                                mp++;
1537                                pref_mismatch = pref_mismatch << 1;
1538                        }
1539                        key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1540
1541                        if (key_prefix != 0)
1542                                goto backtrace;
1543
1544                        if (current_prefix_length >= cn->pos)
1545                                current_prefix_length = mp;
1546                }
1547
1548                pn = (struct tnode *)n; /* Descend */
1549                chopped_off = 0;
1550                continue;
1551
1552backtrace:
1553                chopped_off++;
1554
1555                /* As zero don't change the child key (cindex) */
1556                while ((chopped_off <= pn->bits)
1557                       && !(cindex & (1<<(chopped_off-1))))
1558                        chopped_off++;
1559
1560                /* Decrease current_... with bits chopped off */
1561                if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1562                        current_prefix_length = pn->pos + pn->bits
1563                                - chopped_off;
1564
1565                /*
1566                 * Either we do the actual chop off according or if we have
1567                 * chopped off all bits in this tnode walk up to our parent.
1568                 */
1569
1570                if (chopped_off <= pn->bits) {
1571                        cindex &= ~(1 << (chopped_off-1));
1572                } else {
1573                        struct tnode *parent = node_parent((struct node *) pn);
1574                        if (!parent)
1575                                goto failed;
1576
1577                        /* Get Child's index */
1578                        cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1579                        pn = parent;
1580                        chopped_off = 0;
1581
1582#ifdef CONFIG_IP_FIB_TRIE_STATS
1583                        t->stats.backtrack++;
1584#endif
1585                        goto backtrace;
1586                }
1587        }
1588failed:
1589        ret = 1;
1590found:
1591        rcu_read_unlock();
1592        return ret;
1593}
1594
1595/*
1596 * Remove the leaf and return parent.
1597 */
1598static void trie_leaf_remove(struct trie *t, struct leaf *l)
1599{
1600        struct tnode *tp = node_parent((struct node *) l);
1601
1602        pr_debug("entering trie_leaf_remove(%p)\n", l);
1603
1604        if (tp) {
1605                t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1606                put_child(t, (struct tnode *)tp, cindex, NULL);
1607                trie_rebalance(t, tp);
1608        } else
1609                rcu_assign_pointer(t->trie, NULL);
1610
1611        free_leaf(l);
1612}
1613
1614/*
1615 * Caller must hold RTNL.
1616 */
1617static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1618{
1619        struct trie *t = (struct trie *) tb->tb_data;
1620        u32 key, mask;
1621        int plen = cfg->fc_dst_len;
1622        u8 tos = cfg->fc_tos;
1623        struct fib_alias *fa, *fa_to_delete;
1624        struct list_head *fa_head;
1625        struct leaf *l;
1626        struct leaf_info *li;
1627
1628        if (plen > 32)
1629                return -EINVAL;
1630
1631        key = ntohl(cfg->fc_dst);
1632        mask = ntohl(inet_make_mask(plen));
1633
1634        if (key & ~mask)
1635                return -EINVAL;
1636
1637        key = key & mask;
1638        l = fib_find_node(t, key);
1639
1640        if (!l)
1641                return -ESRCH;
1642
1643        fa_head = get_fa_head(l, plen);
1644        fa = fib_find_alias(fa_head, tos, 0);
1645
1646        if (!fa)
1647                return -ESRCH;
1648
1649        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1650
1651        fa_to_delete = NULL;
1652        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1653        list_for_each_entry_continue(fa, fa_head, fa_list) {
1654                struct fib_info *fi = fa->fa_info;
1655
1656                if (fa->fa_tos != tos)
1657                        break;
1658
1659                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1660                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1661                     fa->fa_scope == cfg->fc_scope) &&
1662                    (!cfg->fc_protocol ||
1663                     fi->fib_protocol == cfg->fc_protocol) &&
1664                    fib_nh_match(cfg, fi) == 0) {
1665                        fa_to_delete = fa;
1666                        break;
1667                }
1668        }
1669
1670        if (!fa_to_delete)
1671                return -ESRCH;
1672
1673        fa = fa_to_delete;
1674        rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1675                  &cfg->fc_nlinfo, 0);
1676
1677        l = fib_find_node(t, key);
1678        li = find_leaf_info(l, plen);
1679
1680        list_del_rcu(&fa->fa_list);
1681
1682        if (list_empty(fa_head)) {
1683                hlist_del_rcu(&li->hlist);
1684                free_leaf_info(li);
1685        }
1686
1687        if (hlist_empty(&l->list))
1688                trie_leaf_remove(t, l);
1689
1690        if (fa->fa_state & FA_S_ACCESSED)
1691                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1692
1693        fib_release_info(fa->fa_info);
1694        alias_free_mem_rcu(fa);
1695        return 0;
1696}
1697
1698static int trie_flush_list(struct list_head *head)
1699{
1700        struct fib_alias *fa, *fa_node;
1701        int found = 0;
1702
1703        list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1704                struct fib_info *fi = fa->fa_info;
1705
1706                if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1707                        list_del_rcu(&fa->fa_list);
1708                        fib_release_info(fa->fa_info);
1709                        alias_free_mem_rcu(fa);
1710                        found++;
1711                }
1712        }
1713        return found;
1714}
1715
1716static int trie_flush_leaf(struct leaf *l)
1717{
1718        int found = 0;
1719        struct hlist_head *lih = &l->list;
1720        struct hlist_node *node, *tmp;
1721        struct leaf_info *li = NULL;
1722
1723        hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1724                found += trie_flush_list(&li->falh);
1725
1726                if (list_empty(&li->falh)) {
1727                        hlist_del_rcu(&li->hlist);
1728                        free_leaf_info(li);
1729                }
1730        }
1731        return found;
1732}
1733
1734/*
1735 * Scan for the next right leaf starting at node p->child[idx]
1736 * Since we have back pointer, no recursion necessary.
1737 */
1738static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1739{
1740        do {
1741                t_key idx;
1742
1743                if (c)
1744                        idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1745                else
1746                        idx = 0;
1747
1748                while (idx < 1u << p->bits) {
1749                        c = tnode_get_child_rcu(p, idx++);
1750                        if (!c)
1751                                continue;
1752
1753                        if (IS_LEAF(c)) {
1754                                prefetch(p->child[idx]);
1755                                return (struct leaf *) c;
1756                        }
1757
1758                        /* Rescan start scanning in new node */
1759                        p = (struct tnode *) c;
1760                        idx = 0;
1761                }
1762
1763                /* Node empty, walk back up to parent */
1764                c = (struct node *) p;
1765        } while ( (p = node_parent_rcu(c)) != NULL);
1766
1767        return NULL; /* Root of trie */
1768}
1769
1770static struct leaf *trie_firstleaf(struct trie *t)
1771{
1772        struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1773
1774        if (!n)
1775                return NULL;
1776
1777        if (IS_LEAF(n))          /* trie is just a leaf */
1778                return (struct leaf *) n;
1779
1780        return leaf_walk_rcu(n, NULL);
1781}
1782
1783static struct leaf *trie_nextleaf(struct leaf *l)
1784{
1785        struct node *c = (struct node *) l;
1786        struct tnode *p = node_parent(c);
1787
1788        if (!p)
1789                return NULL;    /* trie with just one leaf */
1790
1791        return leaf_walk_rcu(p, c);
1792}
1793
1794static struct leaf *trie_leafindex(struct trie *t, int index)
1795{
1796        struct leaf *l = trie_firstleaf(t);
1797
1798        while (l && index-- > 0)
1799                l = trie_nextleaf(l);
1800
1801        return l;
1802}
1803
1804
1805/*
1806 * Caller must hold RTNL.
1807 */
1808static int fn_trie_flush(struct fib_table *tb)
1809{
1810        struct trie *t = (struct trie *) tb->tb_data;
1811        struct leaf *l, *ll = NULL;
1812        int found = 0;
1813
1814        for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1815                found += trie_flush_leaf(l);
1816
1817                if (ll && hlist_empty(&ll->list))
1818                        trie_leaf_remove(t, ll);
1819                ll = l;
1820        }
1821
1822        if (ll && hlist_empty(&ll->list))
1823                trie_leaf_remove(t, ll);
1824
1825        pr_debug("trie_flush found=%d\n", found);
1826        return found;
1827}
1828
1829static void fn_trie_select_default(struct fib_table *tb,
1830                                   const struct flowi *flp,
1831                                   struct fib_result *res)
1832{
1833        struct trie *t = (struct trie *) tb->tb_data;
1834        int order, last_idx;
1835        struct fib_info *fi = NULL;
1836        struct fib_info *last_resort;
1837        struct fib_alias *fa = NULL;
1838        struct list_head *fa_head;
1839        struct leaf *l;
1840
1841        last_idx = -1;
1842        last_resort = NULL;
1843        order = -1;
1844
1845        rcu_read_lock();
1846
1847        l = fib_find_node(t, 0);
1848        if (!l)
1849                goto out;
1850
1851        fa_head = get_fa_head(l, 0);
1852        if (!fa_head)
1853                goto out;
1854
1855        if (list_empty(fa_head))
1856                goto out;
1857
1858        list_for_each_entry_rcu(fa, fa_head, fa_list) {
1859                struct fib_info *next_fi = fa->fa_info;
1860
1861                if (fa->fa_scope != res->scope ||
1862                    fa->fa_type != RTN_UNICAST)
1863                        continue;
1864
1865                if (next_fi->fib_priority > res->fi->fib_priority)
1866                        break;
1867                if (!next_fi->fib_nh[0].nh_gw ||
1868                    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1869                        continue;
1870                fa->fa_state |= FA_S_ACCESSED;
1871
1872                if (fi == NULL) {
1873                        if (next_fi != res->fi)
1874                                break;
1875                } else if (!fib_detect_death(fi, order, &last_resort,
1876                                             &last_idx, tb->tb_default)) {
1877                        fib_result_assign(res, fi);
1878                        tb->tb_default = order;
1879                        goto out;
1880                }
1881                fi = next_fi;
1882                order++;
1883        }
1884        if (order <= 0 || fi == NULL) {
1885                tb->tb_default = -1;
1886                goto out;
1887        }
1888
1889        if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1890                                tb->tb_default)) {
1891                fib_result_assign(res, fi);
1892                tb->tb_default = order;
1893                goto out;
1894        }
1895        if (last_idx >= 0)
1896                fib_result_assign(res, last_resort);
1897        tb->tb_default = last_idx;
1898out:
1899        rcu_read_unlock();
1900}
1901
1902static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1903                           struct fib_table *tb,
1904                           struct sk_buff *skb, struct netlink_callback *cb)
1905{
1906        int i, s_i;
1907        struct fib_alias *fa;
1908        __be32 xkey = htonl(key);
1909
1910        s_i = cb->args[5];
1911        i = 0;
1912
1913        /* rcu_read_lock is hold by caller */
1914
1915        list_for_each_entry_rcu(fa, fah, fa_list) {
1916                if (i < s_i) {
1917                        i++;
1918                        continue;
1919                }
1920
1921                if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1922                                  cb->nlh->nlmsg_seq,
1923                                  RTM_NEWROUTE,
1924                                  tb->tb_id,
1925                                  fa->fa_type,
1926                                  fa->fa_scope,
1927                                  xkey,
1928                                  plen,
1929                                  fa->fa_tos,
1930                                  fa->fa_info, NLM_F_MULTI) < 0) {
1931                        cb->args[5] = i;
1932                        return -1;
1933                }
1934                i++;
1935        }
1936        cb->args[5] = i;
1937        return skb->len;
1938}
1939
1940static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1941                        struct sk_buff *skb, struct netlink_callback *cb)
1942{
1943        struct leaf_info *li;
1944        struct hlist_node *node;
1945        int i, s_i;
1946
1947        s_i = cb->args[4];
1948        i = 0;
1949
1950        /* rcu_read_lock is hold by caller */
1951        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1952                if (i < s_i) {
1953                        i++;
1954                        continue;
1955                }
1956
1957                if (i > s_i)
1958                        cb->args[5] = 0;
1959
1960                if (list_empty(&li->falh))
1961                        continue;
1962
1963                if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1964                        cb->args[4] = i;
1965                        return -1;
1966                }
1967                i++;
1968        }
1969
1970        cb->args[4] = i;
1971        return skb->len;
1972}
1973
1974static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1975                        struct netlink_callback *cb)
1976{
1977        struct leaf *l;
1978        struct trie *t = (struct trie *) tb->tb_data;
1979        t_key key = cb->args[2];
1980        int count = cb->args[3];
1981
1982        rcu_read_lock();
1983        /* Dump starting at last key.
1984         * Note: 0.0.0.0/0 (ie default) is first key.
1985         */
1986        if (count == 0)
1987                l = trie_firstleaf(t);
1988        else {
1989                /* Normally, continue from last key, but if that is missing
1990                 * fallback to using slow rescan
1991                 */
1992                l = fib_find_node(t, key);
1993                if (!l)
1994                        l = trie_leafindex(t, count);
1995        }
1996
1997        while (l) {
1998                cb->args[2] = l->key;
1999                if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
2000                        cb->args[3] = count;
2001                        rcu_read_unlock();
2002                        return -1;
2003                }
2004
2005                ++count;
2006                l = trie_nextleaf(l);
2007                memset(&cb->args[4], 0,
2008                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
2009        }
2010        cb->args[3] = count;
2011        rcu_read_unlock();
2012
2013        return skb->len;
2014}
2015
2016void __init fib_hash_init(void)
2017{
2018        fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2019                                          sizeof(struct fib_alias),
2020                                          0, SLAB_PANIC, NULL);
2021
2022        trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2023                                           max(sizeof(struct leaf),
2024                                               sizeof(struct leaf_info)),
2025                                           0, SLAB_PANIC, NULL);
2026}
2027
2028
2029/* Fix more generic FIB names for init later */
2030struct fib_table *fib_hash_table(u32 id)
2031{
2032        struct fib_table *tb;
2033        struct trie *t;
2034
2035        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2036                     GFP_KERNEL);
2037        if (tb == NULL)
2038                return NULL;
2039
2040        tb->tb_id = id;
2041        tb->tb_default = -1;
2042        tb->tb_lookup = fn_trie_lookup;
2043        tb->tb_insert = fn_trie_insert;
2044        tb->tb_delete = fn_trie_delete;
2045        tb->tb_flush = fn_trie_flush;
2046        tb->tb_select_default = fn_trie_select_default;
2047        tb->tb_dump = fn_trie_dump;
2048
2049        t = (struct trie *) tb->tb_data;
2050        memset(t, 0, sizeof(*t));
2051
2052        if (id == RT_TABLE_LOCAL)
2053                pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2054
2055        return tb;
2056}
2057
2058#ifdef CONFIG_PROC_FS
2059/* Depth first Trie walk iterator */
2060struct fib_trie_iter {
2061        struct seq_net_private p;
2062        struct fib_table *tb;
2063        struct tnode *tnode;
2064        unsigned index;
2065        unsigned depth;
2066};
2067
2068static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2069{
2070        struct tnode *tn = iter->tnode;
2071        unsigned cindex = iter->index;
2072        struct tnode *p;
2073
2074        /* A single entry routing table */
2075        if (!tn)
2076                return NULL;
2077
2078        pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2079                 iter->tnode, iter->index, iter->depth);
2080rescan:
2081        while (cindex < (1<<tn->bits)) {
2082                struct node *n = tnode_get_child_rcu(tn, cindex);
2083
2084                if (n) {
2085                        if (IS_LEAF(n)) {
2086                                iter->tnode = tn;
2087                                iter->index = cindex + 1;
2088                        } else {
2089                                /* push down one level */
2090                                iter->tnode = (struct tnode *) n;
2091                                iter->index = 0;
2092                                ++iter->depth;
2093                        }
2094                        return n;
2095                }
2096
2097                ++cindex;
2098        }
2099
2100        /* Current node exhausted, pop back up */
2101        p = node_parent_rcu((struct node *)tn);
2102        if (p) {
2103                cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2104                tn = p;
2105                --iter->depth;
2106                goto rescan;
2107        }
2108
2109        /* got root? */
2110        return NULL;
2111}
2112
2113static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2114                                       struct trie *t)
2115{
2116        struct node *n;
2117
2118        if (!t)
2119                return NULL;
2120
2121        n = rcu_dereference(t->trie);
2122        if (!n)
2123                return NULL;
2124
2125        if (IS_TNODE(n)) {
2126                iter->tnode = (struct tnode *) n;
2127                iter->index = 0;
2128                iter->depth = 1;
2129        } else {
2130                iter->tnode = NULL;
2131                iter->index = 0;
2132                iter->depth = 0;
2133        }
2134
2135        return n;
2136}
2137
2138static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2139{
2140        struct node *n;
2141        struct fib_trie_iter iter;
2142
2143        memset(s, 0, sizeof(*s));
2144
2145        rcu_read_lock();
2146        for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2147                if (IS_LEAF(n)) {
2148                        struct leaf *l = (struct leaf *)n;
2149                        struct leaf_info *li;
2150                        struct hlist_node *tmp;
2151
2152                        s->leaves++;
2153                        s->totdepth += iter.depth;
2154                        if (iter.depth > s->maxdepth)
2155                                s->maxdepth = iter.depth;
2156
2157                        hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2158                                ++s->prefixes;
2159                } else {
2160                        const struct tnode *tn = (const struct tnode *) n;
2161                        int i;
2162
2163                        s->tnodes++;
2164                        if (tn->bits < MAX_STAT_DEPTH)
2165                                s->nodesizes[tn->bits]++;
2166
2167                        for (i = 0; i < (1<<tn->bits); i++)
2168                                if (!tn->child[i])
2169                                        s->nullpointers++;
2170                }
2171        }
2172        rcu_read_unlock();
2173}
2174
2175/*
2176 *      This outputs /proc/net/fib_triestats
2177 */
2178static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2179{
2180        unsigned i, max, pointers, bytes, avdepth;
2181
2182        if (stat->leaves)
2183                avdepth = stat->totdepth*100 / stat->leaves;
2184        else
2185                avdepth = 0;
2186
2187        seq_printf(seq, "\tAver depth:     %u.%02d\n",
2188                   avdepth / 100, avdepth % 100);
2189        seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2190
2191        seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2192        bytes = sizeof(struct leaf) * stat->leaves;
2193
2194        seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2195        bytes += sizeof(struct leaf_info) * stat->prefixes;
2196
2197        seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2198        bytes += sizeof(struct tnode) * stat->tnodes;
2199
2200        max = MAX_STAT_DEPTH;
2201        while (max > 0 && stat->nodesizes[max-1] == 0)
2202                max--;
2203
2204        pointers = 0;
2205        for (i = 1; i <= max; i++)
2206                if (stat->nodesizes[i] != 0) {
2207                        seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2208                        pointers += (1<<i) * stat->nodesizes[i];
2209                }
2210        seq_putc(seq, '\n');
2211        seq_printf(seq, "\tPointers: %u\n", pointers);
2212
2213        bytes += sizeof(struct node *) * pointers;
2214        seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2215        seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2216}
2217
2218#ifdef CONFIG_IP_FIB_TRIE_STATS
2219static void trie_show_usage(struct seq_file *seq,
2220                            const struct trie_use_stats *stats)
2221{
2222        seq_printf(seq, "\nCounters:\n---------\n");
2223        seq_printf(seq, "gets = %u\n", stats->gets);
2224        seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2225        seq_printf(seq, "semantic match passed = %u\n",
2226                   stats->semantic_match_passed);
2227        seq_printf(seq, "semantic match miss = %u\n",
2228                   stats->semantic_match_miss);
2229        seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2230        seq_printf(seq, "skipped node resize = %u\n\n",
2231                   stats->resize_node_skipped);
2232}
2233#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2234
2235static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2236{
2237        if (tb->tb_id == RT_TABLE_LOCAL)
2238                seq_puts(seq, "Local:\n");
2239        else if (tb->tb_id == RT_TABLE_MAIN)
2240                seq_puts(seq, "Main:\n");
2241        else
2242                seq_printf(seq, "Id %d:\n", tb->tb_id);
2243}
2244
2245
2246static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2247{
2248        struct net *net = (struct net *)seq->private;
2249        unsigned int h;
2250
2251        seq_printf(seq,
2252                   "Basic info: size of leaf:"
2253                   " %Zd bytes, size of tnode: %Zd bytes.\n",
2254                   sizeof(struct leaf), sizeof(struct tnode));
2255
2256        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2257                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2258                struct hlist_node *node;
2259                struct fib_table *tb;
2260
2261                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2262                        struct trie *t = (struct trie *) tb->tb_data;
2263                        struct trie_stat stat;
2264
2265                        if (!t)
2266                                continue;
2267
2268                        fib_table_print(seq, tb);
2269
2270                        trie_collect_stats(t, &stat);
2271                        trie_show_stats(seq, &stat);
2272#ifdef CONFIG_IP_FIB_TRIE_STATS
2273                        trie_show_usage(seq, &t->stats);
2274#endif
2275                }
2276        }
2277
2278        return 0;
2279}
2280
2281static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2282{
2283        return single_open_net(inode, file, fib_triestat_seq_show);
2284}
2285
2286static const struct file_operations fib_triestat_fops = {
2287        .owner  = THIS_MODULE,
2288        .open   = fib_triestat_seq_open,
2289        .read   = seq_read,
2290        .llseek = seq_lseek,
2291        .release = single_release_net,
2292};
2293
2294static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2295{
2296        struct fib_trie_iter *iter = seq->private;
2297        struct net *net = seq_file_net(seq);
2298        loff_t idx = 0;
2299        unsigned int h;
2300
2301        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2302                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2303                struct hlist_node *node;
2304                struct fib_table *tb;
2305
2306                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2307                        struct node *n;
2308
2309                        for (n = fib_trie_get_first(iter,
2310                                                    (struct trie *) tb->tb_data);
2311                             n; n = fib_trie_get_next(iter))
2312                                if (pos == idx++) {
2313                                        iter->tb = tb;
2314                                        return n;
2315                                }
2316                }
2317        }
2318
2319        return NULL;
2320}
2321
2322static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2323        __acquires(RCU)
2324{
2325        rcu_read_lock();
2326        return fib_trie_get_idx(seq, *pos);
2327}
2328
2329static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2330{
2331        struct fib_trie_iter *iter = seq->private;
2332        struct net *net = seq_file_net(seq);
2333        struct fib_table *tb = iter->tb;
2334        struct hlist_node *tb_node;
2335        unsigned int h;
2336        struct node *n;
2337
2338        ++*pos;
2339        /* next node in same table */
2340        n = fib_trie_get_next(iter);
2341        if (n)
2342                return n;
2343
2344        /* walk rest of this hash chain */
2345        h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2346        while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2347                tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2348                n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2349                if (n)
2350                        goto found;
2351        }
2352
2353        /* new hash chain */
2354        while (++h < FIB_TABLE_HASHSZ) {
2355                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2356                hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2357                        n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2358                        if (n)
2359                                goto found;
2360                }
2361        }
2362        return NULL;
2363
2364found:
2365        iter->tb = tb;
2366        return n;
2367}
2368
2369static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2370        __releases(RCU)
2371{
2372        rcu_read_unlock();
2373}
2374
2375static void seq_indent(struct seq_file *seq, int n)
2376{
2377        while (n-- > 0) seq_puts(seq, "   ");
2378}
2379
2380static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2381{
2382        switch (s) {
2383        case RT_SCOPE_UNIVERSE: return "universe";
2384        case RT_SCOPE_SITE:     return "site";
2385        case RT_SCOPE_LINK:     return "link";
2386        case RT_SCOPE_HOST:     return "host";
2387        case RT_SCOPE_NOWHERE:  return "nowhere";
2388        default:
2389                snprintf(buf, len, "scope=%d", s);
2390                return buf;
2391        }
2392}
2393
2394static const char *rtn_type_names[__RTN_MAX] = {
2395        [RTN_UNSPEC] = "UNSPEC",
2396        [RTN_UNICAST] = "UNICAST",
2397        [RTN_LOCAL] = "LOCAL",
2398        [RTN_BROADCAST] = "BROADCAST",
2399        [RTN_ANYCAST] = "ANYCAST",
2400        [RTN_MULTICAST] = "MULTICAST",
2401        [RTN_BLACKHOLE] = "BLACKHOLE",
2402        [RTN_UNREACHABLE] = "UNREACHABLE",
2403        [RTN_PROHIBIT] = "PROHIBIT",
2404        [RTN_THROW] = "THROW",
2405        [RTN_NAT] = "NAT",
2406        [RTN_XRESOLVE] = "XRESOLVE",
2407};
2408
2409static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2410{
2411        if (t < __RTN_MAX && rtn_type_names[t])
2412                return rtn_type_names[t];
2413        snprintf(buf, len, "type %u", t);
2414        return buf;
2415}
2416
2417/* Pretty print the trie */
2418static int fib_trie_seq_show(struct seq_file *seq, void *v)
2419{
2420        const struct fib_trie_iter *iter = seq->private;
2421        struct node *n = v;
2422
2423        if (!node_parent_rcu(n))
2424                fib_table_print(seq, iter->tb);
2425
2426        if (IS_TNODE(n)) {
2427                struct tnode *tn = (struct tnode *) n;
2428                __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2429
2430                seq_indent(seq, iter->depth-1);
2431                seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2432                           &prf, tn->pos, tn->bits, tn->full_children,
2433                           tn->empty_children);
2434
2435        } else {
2436                struct leaf *l = (struct leaf *) n;
2437                struct leaf_info *li;
2438                struct hlist_node *node;
2439                __be32 val = htonl(l->key);
2440
2441                seq_indent(seq, iter->depth);
2442                seq_printf(seq, "  |-- %pI4\n", &val);
2443
2444                hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2445                        struct fib_alias *fa;
2446
2447                        list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2448                                char buf1[32], buf2[32];
2449
2450                                seq_indent(seq, iter->depth+1);
2451                                seq_printf(seq, "  /%d %s %s", li->plen,
2452                                           rtn_scope(buf1, sizeof(buf1),
2453                                                     fa->fa_scope),
2454                                           rtn_type(buf2, sizeof(buf2),
2455                                                    fa->fa_type));
2456                                if (fa->fa_tos)
2457                                        seq_printf(seq, " tos=%d", fa->fa_tos);
2458                                seq_putc(seq, '\n');
2459                        }
2460                }
2461        }
2462
2463        return 0;
2464}
2465
2466static const struct seq_operations fib_trie_seq_ops = {
2467        .start  = fib_trie_seq_start,
2468        .next   = fib_trie_seq_next,
2469        .stop   = fib_trie_seq_stop,
2470        .show   = fib_trie_seq_show,
2471};
2472
2473static int fib_trie_seq_open(struct inode *inode, struct file *file)
2474{
2475        return seq_open_net(inode, file, &fib_trie_seq_ops,
2476                            sizeof(struct fib_trie_iter));
2477}
2478
2479static const struct file_operations fib_trie_fops = {
2480        .owner  = THIS_MODULE,
2481        .open   = fib_trie_seq_open,
2482        .read   = seq_read,
2483        .llseek = seq_lseek,
2484        .release = seq_release_net,
2485};
2486
2487struct fib_route_iter {
2488        struct seq_net_private p;
2489        struct trie *main_trie;
2490        loff_t  pos;
2491        t_key   key;
2492};
2493
2494static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2495{
2496        struct leaf *l = NULL;
2497        struct trie *t = iter->main_trie;
2498
2499        /* use cache location of last found key */
2500        if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2501                pos -= iter->pos;
2502        else {
2503                iter->pos = 0;
2504                l = trie_firstleaf(t);
2505        }
2506
2507        while (l && pos-- > 0) {
2508                iter->pos++;
2509                l = trie_nextleaf(l);
2510        }
2511
2512        if (l)
2513                iter->key = pos;        /* remember it */
2514        else
2515                iter->pos = 0;          /* forget it */
2516
2517        return l;
2518}
2519
2520static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2521        __acquires(RCU)
2522{
2523        struct fib_route_iter *iter = seq->private;
2524        struct fib_table *tb;
2525
2526        rcu_read_lock();
2527        tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2528        if (!tb)
2529                return NULL;
2530
2531        iter->main_trie = (struct trie *) tb->tb_data;
2532        if (*pos == 0)
2533                return SEQ_START_TOKEN;
2534        else
2535                return fib_route_get_idx(iter, *pos - 1);
2536}
2537
2538static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2539{
2540        struct fib_route_iter *iter = seq->private;
2541        struct leaf *l = v;
2542
2543        ++*pos;
2544        if (v == SEQ_START_TOKEN) {
2545                iter->pos = 0;
2546                l = trie_firstleaf(iter->main_trie);
2547        } else {
2548                iter->pos++;
2549                l = trie_nextleaf(l);
2550        }
2551
2552        if (l)
2553                iter->key = l->key;
2554        else
2555                iter->pos = 0;
2556        return l;
2557}
2558
2559static void fib_route_seq_stop(struct seq_file *seq, void *v)
2560        __releases(RCU)
2561{
2562        rcu_read_unlock();
2563}
2564
2565static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2566{
2567        static unsigned type2flags[RTN_MAX + 1] = {
2568                [7] = RTF_REJECT, [8] = RTF_REJECT,
2569        };
2570        unsigned flags = type2flags[type];
2571
2572        if (fi && fi->fib_nh->nh_gw)
2573                flags |= RTF_GATEWAY;
2574        if (mask == htonl(0xFFFFFFFF))
2575                flags |= RTF_HOST;
2576        flags |= RTF_UP;
2577        return flags;
2578}
2579
2580/*
2581 *      This outputs /proc/net/route.
2582 *      The format of the file is not supposed to be changed
2583 *      and needs to be same as fib_hash output to avoid breaking
2584 *      legacy utilities
2585 */
2586static int fib_route_seq_show(struct seq_file *seq, void *v)
2587{
2588        struct leaf *l = v;
2589        struct leaf_info *li;
2590        struct hlist_node *node;
2591
2592        if (v == SEQ_START_TOKEN) {
2593                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2594                           "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2595                           "\tWindow\tIRTT");
2596                return 0;
2597        }
2598
2599        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2600                struct fib_alias *fa;
2601                __be32 mask, prefix;
2602
2603                mask = inet_make_mask(li->plen);
2604                prefix = htonl(l->key);
2605
2606                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2607                        const struct fib_info *fi = fa->fa_info;
2608                        unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2609                        int len;
2610
2611                        if (fa->fa_type == RTN_BROADCAST
2612                            || fa->fa_type == RTN_MULTICAST)
2613                                continue;
2614
2615                        if (fi)
2616                                seq_printf(seq,
2617                                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2618                                         "%d\t%08X\t%d\t%u\t%u%n",
2619                                         fi->fib_dev ? fi->fib_dev->name : "*",
2620                                         prefix,
2621                                         fi->fib_nh->nh_gw, flags, 0, 0,
2622                                         fi->fib_priority,
2623                                         mask,
2624                                         (fi->fib_advmss ?
2625                                          fi->fib_advmss + 40 : 0),
2626                                         fi->fib_window,
2627                                         fi->fib_rtt >> 3, &len);
2628                        else
2629                                seq_printf(seq,
2630                                         "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2631                                         "%d\t%08X\t%d\t%u\t%u%n",
2632                                         prefix, 0, flags, 0, 0, 0,
2633                                         mask, 0, 0, 0, &len);
2634
2635                        seq_printf(seq, "%*s\n", 127 - len, "");
2636                }
2637        }
2638
2639        return 0;
2640}
2641
2642static const struct seq_operations fib_route_seq_ops = {
2643        .start  = fib_route_seq_start,
2644        .next   = fib_route_seq_next,
2645        .stop   = fib_route_seq_stop,
2646        .show   = fib_route_seq_show,
2647};
2648
2649static int fib_route_seq_open(struct inode *inode, struct file *file)
2650{
2651        return seq_open_net(inode, file, &fib_route_seq_ops,
2652                            sizeof(struct fib_route_iter));
2653}
2654
2655static const struct file_operations fib_route_fops = {
2656        .owner  = THIS_MODULE,
2657        .open   = fib_route_seq_open,
2658        .read   = seq_read,
2659        .llseek = seq_lseek,
2660        .release = seq_release_net,
2661};
2662
2663int __net_init fib_proc_init(struct net *net)
2664{
2665        if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2666                goto out1;
2667
2668        if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2669                                  &fib_triestat_fops))
2670                goto out2;
2671
2672        if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2673                goto out3;
2674
2675        return 0;
2676
2677out3:
2678        proc_net_remove(net, "fib_triestat");
2679out2:
2680        proc_net_remove(net, "fib_trie");
2681out1:
2682        return -ENOMEM;
2683}
2684
2685void __net_exit fib_proc_exit(struct net *net)
2686{
2687        proc_net_remove(net, "fib_trie");
2688        proc_net_remove(net, "fib_triestat");
2689        proc_net_remove(net, "route");
2690}
2691
2692#endif /* CONFIG_PROC_FS */
2693
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.