linux/net/ipv4/fib_trie.c History
<<
>>
Prefs
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally descibed in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET         An implementation of the TCP/IP protocol suite for the LINUX
  30 *              operating system.  INET is implemented using the  BSD Socket
  31 *              interface as the means of communication with the user level.
  32 *
  33 *              IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *              This program is free software; you can redistribute it and/or
  39 *              modify it under the terms of the GNU General Public License
  40 *              as published by the Free Software Foundation; either version
  41 *              2 of the License, or (at your option) any later version.
  42 *
  43 * Substantial contributions to this work comes from:
  44 *
  45 *              David S. Miller, <davem@davemloft.net>
  46 *              Stephen Hemminger <shemminger@osdl.org>
  47 *              Paul E. McKenney <paulmck@us.ibm.com>
  48 *              Patrick McHardy <kaber@trash.net>
  49 */
  50
  51#define VERSION "0.409"
  52
  53#include <asm/uaccess.h>
  54#include <asm/system.h>
  55#include <linux/bitops.h>
  56#include <linux/types.h>
  57#include <linux/kernel.h>
  58#include <linux/mm.h>
  59#include <linux/string.h>
  60#include <linux/socket.h>
  61#include <linux/sockios.h>
  62#include <linux/errno.h>
  63#include <linux/in.h>
  64#include <linux/inet.h>
  65#include <linux/inetdevice.h>
  66#include <linux/netdevice.h>
  67#include <linux/if_arp.h>
  68#include <linux/proc_fs.h>
  69#include <linux/rcupdate.h>
  70#include <linux/skbuff.h>
  71#include <linux/netlink.h>
  72#include <linux/init.h>
  73#include <linux/list.h>
  74#include <net/net_namespace.h>
  75#include <net/ip.h>
  76#include <net/protocol.h>
  77#include <net/route.h>
  78#include <net/tcp.h>
  79#include <net/sock.h>
  80#include <net/ip_fib.h>
  81#include "fib_lookup.h"
  82
  83#define MAX_STAT_DEPTH 32
  84
  85#define KEYLENGTH (8*sizeof(t_key))
  86
  87typedef unsigned int t_key;
  88
  89#define T_TNODE 0
  90#define T_LEAF  1
  91#define NODE_TYPE_MASK  0x1UL
  92#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  93
  94#define IS_TNODE(n) (!(n->parent & T_LEAF))
  95#define IS_LEAF(n) (n->parent & T_LEAF)
  96
  97struct node {
  98        unsigned long parent;
  99        t_key key;
 100};
 101
 102struct leaf {
 103        unsigned long parent;
 104        t_key key;
 105        struct hlist_head list;
 106        struct rcu_head rcu;
 107};
 108
 109struct leaf_info {
 110        struct hlist_node hlist;
 111        struct rcu_head rcu;
 112        int plen;
 113        struct list_head falh;
 114};
 115
 116struct tnode {
 117        unsigned long parent;
 118        t_key key;
 119        unsigned char pos;              /* 2log(KEYLENGTH) bits needed */
 120        unsigned char bits;             /* 2log(KEYLENGTH) bits needed */
 121        unsigned int full_children;     /* KEYLENGTH bits needed */
 122        unsigned int empty_children;    /* KEYLENGTH bits needed */
 123        union {
 124                struct rcu_head rcu;
 125                struct work_struct work;
 126                struct tnode *tnode_free;
 127        };
 128        struct node *child[0];
 129};
 130
 131#ifdef CONFIG_IP_FIB_TRIE_STATS
 132struct trie_use_stats {
 133        unsigned int gets;
 134        unsigned int backtrack;
 135        unsigned int semantic_match_passed;
 136        unsigned int semantic_match_miss;
 137        unsigned int null_node_hit;
 138        unsigned int resize_node_skipped;
 139};
 140#endif
 141
 142struct trie_stat {
 143        unsigned int totdepth;
 144        unsigned int maxdepth;
 145        unsigned int tnodes;
 146        unsigned int leaves;
 147        unsigned int nullpointers;
 148        unsigned int prefixes;
 149        unsigned int nodesizes[MAX_STAT_DEPTH];
 150};
 151
 152struct trie {
 153        struct node *trie;
 154#ifdef CONFIG_IP_FIB_TRIE_STATS
 155        struct trie_use_stats stats;
 156#endif
 157};
 158
 159static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 160static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 161                                  int wasfull);
 162static struct node *resize(struct trie *t, struct tnode *tn);
 163static struct tnode *inflate(struct trie *t, struct tnode *tn);
 164static struct tnode *halve(struct trie *t, struct tnode *tn);
 165/* tnodes to free after resize(); protected by RTNL */
 166static struct tnode *tnode_free_head;
 167static size_t tnode_free_size;
 168
 169/*
 170 * synchronize_rcu after call_rcu for that many pages; it should be especially
 171 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 172 * obtained experimentally, aiming to avoid visible slowdown.
 173 */
 174static const int sync_pages = 128;
 175
 176static struct kmem_cache *fn_alias_kmem __read_mostly;
 177static struct kmem_cache *trie_leaf_kmem __read_mostly;
 178
 179static inline struct tnode *node_parent(struct node *node)
 180{
 181        return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
 182}
 183
 184static inline struct tnode *node_parent_rcu(struct node *node)
 185{
 186        struct tnode *ret = node_parent(node);
 187
 188        return rcu_dereference(ret);
 189}
 190
 191/* Same as rcu_assign_pointer
 192 * but that macro() assumes that value is a pointer.
 193 */
 194static inline void node_set_parent(struct node *node, struct tnode *ptr)
 195{
 196        smp_wmb();
 197        node->parent = (unsigned long)ptr | NODE_TYPE(node);
 198}
 199
 200static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
 201{
 202        BUG_ON(i >= 1U << tn->bits);
 203
 204        return tn->child[i];
 205}
 206
 207static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
 208{
 209        struct node *ret = tnode_get_child(tn, i);
 210
 211        return rcu_dereference(ret);
 212}
 213
 214static inline int tnode_child_length(const struct tnode *tn)
 215{
 216        return 1 << tn->bits;
 217}
 218
 219static inline t_key mask_pfx(t_key k, unsigned short l)
 220{
 221        return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 222}
 223
 224static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
 225{
 226        if (offset < KEYLENGTH)
 227                return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 228        else
 229                return 0;
 230}
 231
 232static inline int tkey_equals(t_key a, t_key b)
 233{
 234        return a == b;
 235}
 236
 237static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 238{
 239        if (bits == 0 || offset >= KEYLENGTH)
 240                return 1;
 241        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 242        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 243}
 244
 245static inline int tkey_mismatch(t_key a, int offset, t_key b)
 246{
 247        t_key diff = a ^ b;
 248        int i = offset;
 249
 250        if (!diff)
 251                return 0;
 252        while ((diff << i) >> (KEYLENGTH-1) == 0)
 253                i++;
 254        return i;
 255}
 256
 257/*
 258  To understand this stuff, an understanding of keys and all their bits is
 259  necessary. Every node in the trie has a key associated with it, but not
 260  all of the bits in that key are significant.
 261
 262  Consider a node 'n' and its parent 'tp'.
 263
 264  If n is a leaf, every bit in its key is significant. Its presence is
 265  necessitated by path compression, since during a tree traversal (when
 266  searching for a leaf - unless we are doing an insertion) we will completely
 267  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 268  a potentially successful search, that we have indeed been walking the
 269  correct key path.
 270
 271  Note that we can never "miss" the correct key in the tree if present by
 272  following the wrong path. Path compression ensures that segments of the key
 273  that are the same for all keys with a given prefix are skipped, but the
 274  skipped part *is* identical for each node in the subtrie below the skipped
 275  bit! trie_insert() in this implementation takes care of that - note the
 276  call to tkey_sub_equals() in trie_insert().
 277
 278  if n is an internal node - a 'tnode' here, the various parts of its key
 279  have many different meanings.
 280
 281  Example:
 282  _________________________________________________________________
 283  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 284  -----------------------------------------------------------------
 285    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 286
 287  _________________________________________________________________
 288  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 289  -----------------------------------------------------------------
 290   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 291
 292  tp->pos = 7
 293  tp->bits = 3
 294  n->pos = 15
 295  n->bits = 4
 296
 297  First, let's just ignore the bits that come before the parent tp, that is
 298  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 299  not use them for anything.
 300
 301  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 302  index into the parent's child array. That is, they will be used to find
 303  'n' among tp's children.
 304
 305  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 306  for the node n.
 307
 308  All the bits we have seen so far are significant to the node n. The rest
 309  of the bits are really not needed or indeed known in n->key.
 310
 311  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 312  n's child array, and will of course be different for each child.
 313
 314
 315  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 316  at this point.
 317
 318*/
 319
 320static inline void check_tnode(const struct tnode *tn)
 321{
 322        WARN_ON(tn && tn->pos+tn->bits > 32);
 323}
 324
 325static const int halve_threshold = 25;
 326static const int inflate_threshold = 50;
 327static const int halve_threshold_root = 15;
 328static const int inflate_threshold_root = 30;
 329
 330static void __alias_free_mem(struct rcu_head *head)
 331{
 332        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 333        kmem_cache_free(fn_alias_kmem, fa);
 334}
 335
 336static inline void alias_free_mem_rcu(struct fib_alias *fa)
 337{
 338        call_rcu(&fa->rcu, __alias_free_mem);
 339}
 340
 341static void __leaf_free_rcu(struct rcu_head *head)
 342{
 343        struct leaf *l = container_of(head, struct leaf, rcu);
 344        kmem_cache_free(trie_leaf_kmem, l);
 345}
 346
 347static inline void free_leaf(struct leaf *l)
 348{
 349        call_rcu_bh(&l->rcu, __leaf_free_rcu);
 350}
 351
 352static void __leaf_info_free_rcu(struct rcu_head *head)
 353{
 354        kfree(container_of(head, struct leaf_info, rcu));
 355}
 356
 357static inline void free_leaf_info(struct leaf_info *leaf)
 358{
 359        call_rcu(&leaf->rcu, __leaf_info_free_rcu);
 360}
 361
 362static struct tnode *tnode_alloc(size_t size)
 363{
 364        if (size <= PAGE_SIZE)
 365                return kzalloc(size, GFP_KERNEL);
 366        else
 367                return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
 368}
 369
 370static void __tnode_vfree(struct work_struct *arg)
 371{
 372        struct tnode *tn = container_of(arg, struct tnode, work);
 373        vfree(tn);
 374}
 375
 376static void __tnode_free_rcu(struct rcu_head *head)
 377{
 378        struct tnode *tn = container_of(head, struct tnode, rcu);
 379        size_t size = sizeof(struct tnode) +
 380                      (sizeof(struct node *) << tn->bits);
 381
 382        if (size <= PAGE_SIZE)
 383                kfree(tn);
 384        else {
 385                INIT_WORK(&tn->work, __tnode_vfree);
 386                schedule_work(&tn->work);
 387        }
 388}
 389
 390static inline void tnode_free(struct tnode *tn)
 391{
 392        if (IS_LEAF(tn))
 393                free_leaf((struct leaf *) tn);
 394        else
 395                call_rcu(&tn->rcu, __tnode_free_rcu);
 396}
 397
 398static void tnode_free_safe(struct tnode *tn)
 399{
 400        BUG_ON(IS_LEAF(tn));
 401        tn->tnode_free = tnode_free_head;
 402        tnode_free_head = tn;
 403        tnode_free_size += sizeof(struct tnode) +
 404                           (sizeof(struct node *) << tn->bits);
 405}
 406
 407static void tnode_free_flush(void)
 408{
 409        struct tnode *tn;
 410
 411        while ((tn = tnode_free_head)) {
 412                tnode_free_head = tn->tnode_free;
 413                tn->tnode_free = NULL;
 414                tnode_free(tn);
 415        }
 416
 417        if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 418                tnode_free_size = 0;
 419                synchronize_rcu();
 420        }
 421}
 422
 423static struct leaf *leaf_new(void)
 424{
 425        struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 426        if (l) {
 427                l->parent = T_LEAF;
 428                INIT_HLIST_HEAD(&l->list);
 429        }
 430        return l;
 431}
 432
 433static struct leaf_info *leaf_info_new(int plen)
 434{
 435        struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 436        if (li) {
 437                li->plen = plen;
 438                INIT_LIST_HEAD(&li->falh);
 439        }
 440        return li;
 441}
 442
 443static struct tnode *tnode_new(t_key key, int pos, int bits)
 444{
 445        size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
 446        struct tnode *tn = tnode_alloc(sz);
 447
 448        if (tn) {
 449                tn->parent = T_TNODE;
 450                tn->pos = pos;
 451                tn->bits = bits;
 452                tn->key = key;
 453                tn->full_children = 0;
 454                tn->empty_children = 1<<bits;
 455        }
 456
 457        pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
 458                 (unsigned long) (sizeof(struct node) << bits));
 459        return tn;
 460}
 461
 462/*
 463 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 464 * and no bits are skipped. See discussion in dyntree paper p. 6
 465 */
 466
 467static inline int tnode_full(const struct tnode *tn, const struct node *n)
 468{
 469        if (n == NULL || IS_LEAF(n))
 470                return 0;
 471
 472        return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 473}
 474
 475static inline void put_child(struct trie *t, struct tnode *tn, int i,
 476                             struct node *n)
 477{
 478        tnode_put_child_reorg(tn, i, n, -1);
 479}
 480
 481 /*
 482  * Add a child at position i overwriting the old value.
 483  * Update the value of full_children and empty_children.
 484  */
 485
 486static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 487                                  int wasfull)
 488{
 489        struct node *chi = tn->child[i];
 490        int isfull;
 491
 492        BUG_ON(i >= 1<<tn->bits);
 493
 494        /* update emptyChildren */
 495        if (n == NULL && chi != NULL)
 496                tn->empty_children++;
 497        else if (n != NULL && chi == NULL)
 498                tn->empty_children--;
 499
 500        /* update fullChildren */
 501        if (wasfull == -1)
 502                wasfull = tnode_full(tn, chi);
 503
 504        isfull = tnode_full(tn, n);
 505        if (wasfull && !isfull)
 506                tn->full_children--;
 507        else if (!wasfull && isfull)
 508                tn->full_children++;
 509
 510        if (n)
 511                node_set_parent(n, tn);
 512
 513        rcu_assign_pointer(tn->child[i], n);
 514}
 515
 516#define MAX_WORK 10
 517static struct node *resize(struct trie *t, struct tnode *tn)
 518{
 519        int i;
 520        struct tnode *old_tn;
 521        int inflate_threshold_use;
 522        int halve_threshold_use;
 523        int max_work;
 524
 525        if (!tn)
 526                return NULL;
 527
 528        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 529                 tn, inflate_threshold, halve_threshold);
 530
 531        /* No children */
 532        if (tn->empty_children == tnode_child_length(tn)) {
 533                tnode_free_safe(tn);
 534                return NULL;
 535        }
 536        /* One child */
 537        if (tn->empty_children == tnode_child_length(tn) - 1)
 538                goto one_child;
 539        /*
 540         * Double as long as the resulting node has a number of
 541         * nonempty nodes that are above the threshold.
 542         */
 543
 544        /*
 545         * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 546         * the Helsinki University of Technology and Matti Tikkanen of Nokia
 547         * Telecommunications, page 6:
 548         * "A node is doubled if the ratio of non-empty children to all
 549         * children in the *doubled* node is at least 'high'."
 550         *
 551         * 'high' in this instance is the variable 'inflate_threshold'. It
 552         * is expressed as a percentage, so we multiply it with
 553         * tnode_child_length() and instead of multiplying by 2 (since the
 554         * child array will be doubled by inflate()) and multiplying
 555         * the left-hand side by 100 (to handle the percentage thing) we
 556         * multiply the left-hand side by 50.
 557         *
 558         * The left-hand side may look a bit weird: tnode_child_length(tn)
 559         * - tn->empty_children is of course the number of non-null children
 560         * in the current node. tn->full_children is the number of "full"
 561         * children, that is non-null tnodes with a skip value of 0.
 562         * All of those will be doubled in the resulting inflated tnode, so
 563         * we just count them one extra time here.
 564         *
 565         * A clearer way to write this would be:
 566         *
 567         * to_be_doubled = tn->full_children;
 568         * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 569         *     tn->full_children;
 570         *
 571         * new_child_length = tnode_child_length(tn) * 2;
 572         *
 573         * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 574         *      new_child_length;
 575         * if (new_fill_factor >= inflate_threshold)
 576         *
 577         * ...and so on, tho it would mess up the while () loop.
 578         *
 579         * anyway,
 580         * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 581         *      inflate_threshold
 582         *
 583         * avoid a division:
 584         * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 585         *      inflate_threshold * new_child_length
 586         *
 587         * expand not_to_be_doubled and to_be_doubled, and shorten:
 588         * 100 * (tnode_child_length(tn) - tn->empty_children +
 589         *    tn->full_children) >= inflate_threshold * new_child_length
 590         *
 591         * expand new_child_length:
 592         * 100 * (tnode_child_length(tn) - tn->empty_children +
 593         *    tn->full_children) >=
 594         *      inflate_threshold * tnode_child_length(tn) * 2
 595         *
 596         * shorten again:
 597         * 50 * (tn->full_children + tnode_child_length(tn) -
 598         *    tn->empty_children) >= inflate_threshold *
 599         *    tnode_child_length(tn)
 600         *
 601         */
 602
 603        check_tnode(tn);
 604
 605        /* Keep root node larger  */
 606
 607        if (!node_parent((struct node*) tn)) {
 608                inflate_threshold_use = inflate_threshold_root;
 609                halve_threshold_use = halve_threshold_root;
 610        }
 611        else {
 612                inflate_threshold_use = inflate_threshold;
 613                halve_threshold_use = halve_threshold;
 614        }
 615
 616        max_work = MAX_WORK;
 617        while ((tn->full_children > 0 &&  max_work-- &&
 618                50 * (tn->full_children + tnode_child_length(tn)
 619                      - tn->empty_children)
 620                >= inflate_threshold_use * tnode_child_length(tn))) {
 621
 622                old_tn = tn;
 623                tn = inflate(t, tn);
 624
 625                if (IS_ERR(tn)) {
 626                        tn = old_tn;
 627#ifdef CONFIG_IP_FIB_TRIE_STATS
 628                        t->stats.resize_node_skipped++;
 629#endif
 630                        break;
 631                }
 632        }
 633
 634        check_tnode(tn);
 635
 636        /* Return if at least one inflate is run */
 637        if( max_work != MAX_WORK)
 638                return (struct node *) tn;
 639
 640        /*
 641         * Halve as long as the number of empty children in this
 642         * node is above threshold.
 643         */
 644
 645        max_work = MAX_WORK;
 646        while (tn->bits > 1 &&  max_work-- &&
 647               100 * (tnode_child_length(tn) - tn->empty_children) <
 648               halve_threshold_use * tnode_child_length(tn)) {
 649
 650                old_tn = tn;
 651                tn = halve(t, tn);
 652                if (IS_ERR(tn)) {
 653                        tn = old_tn;
 654#ifdef CONFIG_IP_FIB_TRIE_STATS
 655                        t->stats.resize_node_skipped++;
 656#endif
 657                        break;
 658                }
 659        }
 660
 661
 662        /* Only one child remains */
 663        if (tn->empty_children == tnode_child_length(tn) - 1) {
 664one_child:
 665                for (i = 0; i < tnode_child_length(tn); i++) {
 666                        struct node *n;
 667
 668                        n = tn->child[i];
 669                        if (!n)
 670                                continue;
 671
 672                        /* compress one level */
 673
 674                        node_set_parent(n, NULL);
 675                        tnode_free_safe(tn);
 676                        return n;
 677                }
 678        }
 679        return (struct node *) tn;
 680}
 681
 682static struct tnode *inflate(struct trie *t, struct tnode *tn)
 683{
 684        struct tnode *oldtnode = tn;
 685        int olen = tnode_child_length(tn);
 686        int i;
 687
 688        pr_debug("In inflate\n");
 689
 690        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 691
 692        if (!tn)
 693                return ERR_PTR(-ENOMEM);
 694
 695        /*
 696         * Preallocate and store tnodes before the actual work so we
 697         * don't get into an inconsistent state if memory allocation
 698         * fails. In case of failure we return the oldnode and  inflate
 699         * of tnode is ignored.
 700         */
 701
 702        for (i = 0; i < olen; i++) {
 703                struct tnode *inode;
 704
 705                inode = (struct tnode *) tnode_get_child(oldtnode, i);
 706                if (inode &&
 707                    IS_TNODE(inode) &&
 708                    inode->pos == oldtnode->pos + oldtnode->bits &&
 709                    inode->bits > 1) {
 710                        struct tnode *left, *right;
 711                        t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 712
 713                        left = tnode_new(inode->key&(~m), inode->pos + 1,
 714                                         inode->bits - 1);
 715                        if (!left)
 716                                goto nomem;
 717
 718                        right = tnode_new(inode->key|m, inode->pos + 1,
 719                                          inode->bits - 1);
 720
 721                        if (!right) {
 722                                tnode_free(left);
 723                                goto nomem;
 724                        }
 725
 726                        put_child(t, tn, 2*i, (struct node *) left);
 727                        put_child(t, tn, 2*i+1, (struct node *) right);
 728                }
 729        }
 730
 731        for (i = 0; i < olen; i++) {
 732                struct tnode *inode;
 733                struct node *node = tnode_get_child(oldtnode, i);
 734                struct tnode *left, *right;
 735                int size, j;
 736
 737                /* An empty child */
 738                if (node == NULL)
 739                        continue;
 740
 741                /* A leaf or an internal node with skipped bits */
 742
 743                if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 744                   tn->pos + tn->bits - 1) {
 745                        if (tkey_extract_bits(node->key,
 746                                              oldtnode->pos + oldtnode->bits,
 747                                              1) == 0)
 748                                put_child(t, tn, 2*i, node);
 749                        else
 750                                put_child(t, tn, 2*i+1, node);
 751                        continue;
 752                }
 753
 754                /* An internal node with two children */
 755                inode = (struct tnode *) node;
 756
 757                if (inode->bits == 1) {
 758                        put_child(t, tn, 2*i, inode->child[0]);
 759                        put_child(t, tn, 2*i+1, inode->child[1]);
 760
 761                        tnode_free_safe(inode);
 762                        continue;
 763                }
 764
 765                /* An internal node with more than two children */
 766
 767                /* We will replace this node 'inode' with two new
 768                 * ones, 'left' and 'right', each with half of the
 769                 * original children. The two new nodes will have
 770                 * a position one bit further down the key and this
 771                 * means that the "significant" part of their keys
 772                 * (see the discussion near the top of this file)
 773                 * will differ by one bit, which will be "0" in
 774                 * left's key and "1" in right's key. Since we are
 775                 * moving the key position by one step, the bit that
 776                 * we are moving away from - the bit at position
 777                 * (inode->pos) - is the one that will differ between
 778                 * left and right. So... we synthesize that bit in the
 779                 * two  new keys.
 780                 * The mask 'm' below will be a single "one" bit at
 781                 * the position (inode->pos)
 782                 */
 783
 784                /* Use the old key, but set the new significant
 785                 *   bit to zero.
 786                 */
 787
 788                left = (struct tnode *) tnode_get_child(tn, 2*i);
 789                put_child(t, tn, 2*i, NULL);
 790
 791                BUG_ON(!left);
 792
 793                right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 794                put_child(t, tn, 2*i+1, NULL);
 795
 796                BUG_ON(!right);
 797
 798                size = tnode_child_length(left);
 799                for (j = 0; j < size; j++) {
 800                        put_child(t, left, j, inode->child[j]);
 801                        put_child(t, right, j, inode->child[j + size]);
 802                }
 803                put_child(t, tn, 2*i, resize(t, left));
 804                put_child(t, tn, 2*i+1, resize(t, right));
 805
 806                tnode_free_safe(inode);
 807        }
 808        tnode_free_safe(oldtnode);
 809        return tn;
 810nomem:
 811        {
 812                int size = tnode_child_length(tn);
 813                int j;
 814
 815                for (j = 0; j < size; j++)
 816                        if (tn->child[j])
 817                                tnode_free((struct tnode *)tn->child[j]);
 818
 819                tnode_free(tn);
 820
 821                return ERR_PTR(-ENOMEM);
 822        }
 823}
 824
 825static struct tnode *halve(struct trie *t, struct tnode *tn)
 826{
 827        struct tnode *oldtnode = tn;
 828        struct node *left, *right;
 829        int i;
 830        int olen = tnode_child_length(tn);
 831
 832        pr_debug("In halve\n");
 833
 834        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 835
 836        if (!tn)
 837                return ERR_PTR(-ENOMEM);
 838
 839        /*
 840         * Preallocate and store tnodes before the actual work so we
 841         * don't get into an inconsistent state if memory allocation
 842         * fails. In case of failure we return the oldnode and halve
 843         * of tnode is ignored.
 844         */
 845
 846        for (i = 0; i < olen; i += 2) {
 847                left = tnode_get_child(oldtnode, i);
 848                right = tnode_get_child(oldtnode, i+1);
 849
 850                /* Two nonempty children */
 851                if (left && right) {
 852                        struct tnode *newn;
 853
 854                        newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 855
 856                        if (!newn)
 857                                goto nomem;
 858
 859                        put_child(t, tn, i/2, (struct node *)newn);
 860                }
 861
 862        }
 863
 864        for (i = 0; i < olen; i += 2) {
 865                struct tnode *newBinNode;
 866
 867                left = tnode_get_child(oldtnode, i);
 868                right = tnode_get_child(oldtnode, i+1);
 869
 870                /* At least one of the children is empty */
 871                if (left == NULL) {
 872                        if (right == NULL)    /* Both are empty */
 873                                continue;
 874                        put_child(t, tn, i/2, right);
 875                        continue;
 876                }
 877
 878                if (right == NULL) {
 879                        put_child(t, tn, i/2, left);
 880                        continue;
 881                }
 882
 883                /* Two nonempty children */
 884                newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 885                put_child(t, tn, i/2, NULL);
 886                put_child(t, newBinNode, 0, left);
 887                put_child(t, newBinNode, 1, right);
 888                put_child(t, tn, i/2, resize(t, newBinNode));
 889        }
 890        tnode_free_safe(oldtnode);
 891        return tn;
 892nomem:
 893        {
 894                int size = tnode_child_length(tn);
 895                int j;
 896
 897                for (j = 0; j < size; j++)
 898                        if (tn->child[j])
 899                                tnode_free((struct tnode *)tn->child[j]);
 900
 901                tnode_free(tn);
 902
 903                return ERR_PTR(-ENOMEM);
 904        }
 905}
 906
 907/* readside must use rcu_read_lock currently dump routines
 908 via get_fa_head and dump */
 909
 910static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 911{
 912        struct hlist_head *head = &l->list;
 913        struct hlist_node *node;
 914        struct leaf_info *li;
 915
 916        hlist_for_each_entry_rcu(li, node, head, hlist)
 917                if (li->plen == plen)
 918                        return li;
 919
 920        return NULL;
 921}
 922
 923static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 924{
 925        struct leaf_info *li = find_leaf_info(l, plen);
 926
 927        if (!li)
 928                return NULL;
 929
 930        return &li->falh;
 931}
 932
 933static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 934{
 935        struct leaf_info *li = NULL, *last = NULL;
 936        struct hlist_node *node;
 937
 938        if (hlist_empty(head)) {
 939                hlist_add_head_rcu(&new->hlist, head);
 940        } else {
 941                hlist_for_each_entry(li, node, head, hlist) {
 942                        if (new->plen > li->plen)
 943                                break;
 944
 945                        last = li;
 946                }
 947                if (last)
 948                        hlist_add_after_rcu(&last->hlist, &new->hlist);
 949                else
 950                        hlist_add_before_rcu(&new->hlist, &li->hlist);
 951        }
 952}
 953
 954/* rcu_read_lock needs to be hold by caller from readside */
 955
 956static struct leaf *
 957fib_find_node(struct trie *t, u32 key)
 958{
 959        int pos;
 960        struct tnode *tn;
 961        struct node *n;
 962
 963        pos = 0;
 964        n = rcu_dereference(t->trie);
 965
 966        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 967                tn = (struct tnode *) n;
 968
 969                check_tnode(tn);
 970
 971                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 972                        pos = tn->pos + tn->bits;
 973                        n = tnode_get_child_rcu(tn,
 974                                                tkey_extract_bits(key,
 975                                                                  tn->pos,
 976                                                                  tn->bits));
 977                } else
 978                        break;
 979        }
 980        /* Case we have found a leaf. Compare prefixes */
 981
 982        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 983                return (struct leaf *)n;
 984
 985        return NULL;
 986}
 987
 988static void trie_rebalance(struct trie *t, struct tnode *tn)
 989{
 990        int wasfull;
 991        t_key cindex, key;
 992        struct tnode *tp;
 993
 994        key = tn->key;
 995
 996        while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
 997                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 998                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
 999                tn = (struct tnode *) resize(t, (struct tnode *)tn);
1000
1001                tnode_put_child_reorg((struct tnode *)tp, cindex,
1002                                      (struct node *)tn, wasfull);
1003
1004                tp = node_parent((struct node *) tn);
1005                if (!tp)
1006                        rcu_assign_pointer(t->trie, (struct node *)tn);
1007
1008                tnode_free_flush();
1009                if (!tp)
1010                        break;
1011                tn = tp;
1012        }
1013
1014        /* Handle last (top) tnode */
1015        if (IS_TNODE(tn))
1016                tn = (struct tnode *)resize(t, (struct tnode *)tn);
1017
1018        rcu_assign_pointer(t->trie, (struct node *)tn);
1019        tnode_free_flush();
1020
1021        return;
1022}
1023
1024/* only used from updater-side */
1025
1026static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1027{
1028        int pos, newpos;
1029        struct tnode *tp = NULL, *tn = NULL;
1030        struct node *n;
1031        struct leaf *l;
1032        int missbit;
1033        struct list_head *fa_head = NULL;
1034        struct leaf_info *li;
1035        t_key cindex;
1036
1037        pos = 0;
1038        n = t->trie;
1039
1040        /* If we point to NULL, stop. Either the tree is empty and we should
1041         * just put a new leaf in if, or we have reached an empty child slot,
1042         * and we should just put our new leaf in that.
1043         * If we point to a T_TNODE, check if it matches our key. Note that
1044         * a T_TNODE might be skipping any number of bits - its 'pos' need
1045         * not be the parent's 'pos'+'bits'!
1046         *
1047         * If it does match the current key, get pos/bits from it, extract
1048         * the index from our key, push the T_TNODE and walk the tree.
1049         *
1050         * If it doesn't, we have to replace it with a new T_TNODE.
1051         *
1052         * If we point to a T_LEAF, it might or might not have the same key
1053         * as we do. If it does, just change the value, update the T_LEAF's
1054         * value, and return it.
1055         * If it doesn't, we need to replace it with a T_TNODE.
1056         */
1057
1058        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1059                tn = (struct tnode *) n;
1060
1061                check_tnode(tn);
1062
1063                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1064                        tp = tn;
1065                        pos = tn->pos + tn->bits;
1066                        n = tnode_get_child(tn,
1067                                            tkey_extract_bits(key,
1068                                                              tn->pos,
1069                                                              tn->bits));
1070
1071                        BUG_ON(n && node_parent(n) != tn);
1072                } else
1073                        break;
1074        }
1075
1076        /*
1077         * n  ----> NULL, LEAF or TNODE
1078         *
1079         * tp is n's (parent) ----> NULL or TNODE
1080         */
1081
1082        BUG_ON(tp && IS_LEAF(tp));
1083
1084        /* Case 1: n is a leaf. Compare prefixes */
1085
1086        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1087                l = (struct leaf *) n;
1088                li = leaf_info_new(plen);
1089
1090                if (!li)
1091                        return NULL;
1092
1093                fa_head = &li->falh;
1094                insert_leaf_info(&l->list, li);
1095                goto done;
1096        }
1097        l = leaf_new();
1098
1099        if (!l)
1100                return NULL;
1101
1102        l->key = key;
1103        li = leaf_info_new(plen);
1104
1105        if (!li) {
1106                free_leaf(l);
1107                return NULL;
1108        }
1109
1110        fa_head = &li->falh;
1111        insert_leaf_info(&l->list, li);
1112
1113        if (t->trie && n == NULL) {
1114                /* Case 2: n is NULL, and will just insert a new leaf */
1115
1116                node_set_parent((struct node *)l, tp);
1117
1118                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1119                put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1120        } else {
1121                /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1122                /*
1123                 *  Add a new tnode here
1124                 *  first tnode need some special handling
1125                 */
1126
1127                if (tp)
1128                        pos = tp->pos+tp->bits;
1129                else
1130                        pos = 0;
1131
1132                if (n) {
1133                        newpos = tkey_mismatch(key, pos, n->key);
1134                        tn = tnode_new(n->key, newpos, 1);
1135                } else {
1136                        newpos = 0;
1137                        tn = tnode_new(key, newpos, 1); /* First tnode */
1138                }
1139
1140                if (!tn) {
1141                        free_leaf_info(li);
1142                        free_leaf(l);
1143                        return NULL;
1144                }
1145
1146                node_set_parent((struct node *)tn, tp);
1147
1148                missbit = tkey_extract_bits(key, newpos, 1);
1149                put_child(t, tn, missbit, (struct node *)l);
1150                put_child(t, tn, 1-missbit, n);
1151
1152                if (tp) {
1153                        cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1154                        put_child(t, (struct tnode *)tp, cindex,
1155                                  (struct node *)tn);
1156                } else {
1157                        rcu_assign_pointer(t->trie, (struct node *)tn);
1158                        tp = tn;
1159                }
1160        }
1161
1162        if (tp && tp->pos + tp->bits > 32)
1163                pr_warning("fib_trie"
1164                           " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1165                           tp, tp->pos, tp->bits, key, plen);
1166
1167        /* Rebalance the trie */
1168
1169        trie_rebalance(t, tp);
1170done:
1171        return fa_head;
1172}
1173
1174/*
1175 * Caller must hold RTNL.
1176 */
1177static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1178{
1179        struct trie *t = (struct trie *) tb->tb_data;
1180        struct fib_alias *fa, *new_fa;
1181        struct list_head *fa_head = NULL;
1182        struct fib_info *fi;
1183        int plen = cfg->fc_dst_len;
1184        u8 tos = cfg->fc_tos;
1185        u32 key, mask;
1186        int err;
1187        struct leaf *l;
1188
1189        if (plen > 32)
1190                return -EINVAL;
1191
1192        key = ntohl(cfg->fc_dst);
1193
1194        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1195
1196        mask = ntohl(inet_make_mask(plen));
1197
1198        if (key & ~mask)
1199                return -EINVAL;
1200
1201        key = key & mask;
1202
1203        fi = fib_create_info(cfg);
1204        if (IS_ERR(fi)) {
1205                err = PTR_ERR(fi);
1206                goto err;
1207        }
1208
1209        l = fib_find_node(t, key);
1210        fa = NULL;
1211
1212        if (l) {
1213                fa_head = get_fa_head(l, plen);
1214                fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1215        }
1216
1217        /* Now fa, if non-NULL, points to the first fib alias
1218         * with the same keys [prefix,tos,priority], if such key already
1219         * exists or to the node before which we will insert new one.
1220         *
1221         * If fa is NULL, we will need to allocate a new one and
1222         * insert to the head of f.
1223         *
1224         * If f is NULL, no fib node matched the destination key
1225         * and we need to allocate a new one of those as well.
1226         */
1227
1228        if (fa && fa->fa_tos == tos &&
1229            fa->fa_info->fib_priority == fi->fib_priority) {
1230                struct fib_alias *fa_first, *fa_match;
1231
1232                err = -EEXIST;
1233                if (cfg->fc_nlflags & NLM_F_EXCL)
1234                        goto out;
1235
1236                /* We have 2 goals:
1237                 * 1. Find exact match for type, scope, fib_info to avoid
1238                 * duplicate routes
1239                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1240                 */
1241                fa_match = NULL;
1242                fa_first = fa;
1243                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1244                list_for_each_entry_continue(fa, fa_head, fa_list) {
1245                        if (fa->fa_tos != tos)
1246                                break;
1247                        if (fa->fa_info->fib_priority != fi->fib_priority)
1248                                break;
1249                        if (fa->fa_type == cfg->fc_type &&
1250                            fa->fa_scope == cfg->fc_scope &&
1251                            fa->fa_info == fi) {
1252                                fa_match = fa;
1253                                break;
1254                        }
1255                }
1256
1257                if (cfg->fc_nlflags & NLM_F_REPLACE) {
1258                        struct fib_info *fi_drop;
1259                        u8 state;
1260
1261                        fa = fa_first;
1262                        if (fa_match) {
1263                                if (fa == fa_match)
1264                                        err = 0;
1265                                goto out;
1266                        }
1267                        err = -ENOBUFS;
1268                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1269                        if (new_fa == NULL)
1270                                goto out;
1271
1272                        fi_drop = fa->fa_info;
1273                        new_fa->fa_tos = fa->fa_tos;
1274                        new_fa->fa_info = fi;
1275                        new_fa->fa_type = cfg->fc_type;
1276                        new_fa->fa_scope = cfg->fc_scope;
1277                        state = fa->fa_state;
1278                        new_fa->fa_state = state & ~FA_S_ACCESSED;
1279
1280                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1281                        alias_free_mem_rcu(fa);
1282
1283                        fib_release_info(fi_drop);
1284                        if (state & FA_S_ACCESSED)
1285                                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1286                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1287                                tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1288
1289                        goto succeeded;
1290                }
1291                /* Error if we find a perfect match which
1292                 * uses the same scope, type, and nexthop
1293                 * information.
1294                 */
1295                if (fa_match)
1296                        goto out;
1297
1298                if (!(cfg->fc_nlflags & NLM_F_APPEND))
1299                        fa = fa_first;
1300        }
1301        err = -ENOENT;
1302        if (!(cfg->fc_nlflags & NLM_F_CREATE))
1303                goto out;
1304
1305        err = -ENOBUFS;
1306        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1307        if (new_fa == NULL)
1308                goto out;
1309
1310        new_fa->fa_info = fi;
1311        new_fa->fa_tos = tos;
1312        new_fa->fa_type = cfg->fc_type;
1313        new_fa->fa_scope = cfg->fc_scope;
1314        new_fa->fa_state = 0;
1315        /*
1316         * Insert new entry to the list.
1317         */
1318
1319        if (!fa_head) {
1320                fa_head = fib_insert_node(t, key, plen);
1321                if (unlikely(!fa_head)) {
1322                        err = -ENOMEM;
1323                        goto out_free_new_fa;
1324                }
1325        }
1326
1327        list_add_tail_rcu(&new_fa->fa_list,
1328                          (fa ? &fa->fa_list : fa_head));
1329
1330        rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1331        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1332                  &cfg->fc_nlinfo, 0);
1333succeeded:
1334        return 0;
1335
1336out_free_new_fa:
1337        kmem_cache_free(fn_alias_kmem, new_fa);
1338out:
1339        fib_release_info(fi);
1340err:
1341        return err;
1342}
1343
1344/* should be called with rcu_read_lock */
1345static int check_leaf(struct trie *t, struct leaf *l,
1346                      t_key key,  const struct flowi *flp,
1347                      struct fib_result *res)
1348{
1349        struct leaf_info *li;
1350        struct hlist_head *hhead = &l->list;
1351        struct hlist_node *node;
1352
1353        hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1354                int err;
1355                int plen = li->plen;
1356                __be32 mask = inet_make_mask(plen);
1357
1358                if (l->key != (key & ntohl(mask)))
1359                        continue;
1360
1361                err = fib_semantic_match(&li->falh, flp, res, plen);
1362
1363#ifdef CONFIG_IP_FIB_TRIE_STATS
1364                if (err <= 0)
1365                        t->stats.semantic_match_passed++;
1366                else
1367                        t->stats.semantic_match_miss++;
1368#endif
1369                if (err <= 0)
1370                        return err;
1371        }
1372
1373        return 1;
1374}
1375
1376static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1377                          struct fib_result *res)
1378{
1379        struct trie *t = (struct trie *) tb->tb_data;
1380        int ret;
1381        struct node *n;
1382        struct tnode *pn;
1383        int pos, bits;
1384        t_key key = ntohl(flp->fl4_dst);
1385        int chopped_off;
1386        t_key cindex = 0;
1387        int current_prefix_length = KEYLENGTH;
1388        struct tnode *cn;
1389        t_key node_prefix, key_prefix, pref_mismatch;
1390        int mp;
1391
1392        rcu_read_lock();
1393
1394        n = rcu_dereference(t->trie);
1395        if (!n)
1396                goto failed;
1397
1398#ifdef CONFIG_IP_FIB_TRIE_STATS
1399        t->stats.gets++;
1400#endif
1401
1402        /* Just a leaf? */
1403        if (IS_LEAF(n)) {
1404                ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1405                goto found;
1406        }
1407
1408        pn = (struct tnode *) n;
1409        chopped_off = 0;
1410
1411        while (pn) {
1412                pos = pn->pos;
1413                bits = pn->bits;
1414
1415                if (!chopped_off)
1416                        cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1417                                                   pos, bits);
1418
1419                n = tnode_get_child_rcu(pn, cindex);
1420
1421                if (n == NULL) {
1422#ifdef CONFIG_IP_FIB_TRIE_STATS
1423                        t->stats.null_node_hit++;
1424#endif
1425                        goto backtrace;
1426                }
1427
1428                if (IS_LEAF(n)) {
1429                        ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1430                        if (ret > 0)
1431                                goto backtrace;
1432                        goto found;
1433                }
1434
1435                cn = (struct tnode *)n;
1436
1437                /*
1438                 * It's a tnode, and we can do some extra checks here if we
1439                 * like, to avoid descending into a dead-end branch.
1440                 * This tnode is in the parent's child array at index
1441                 * key[p_pos..p_pos+p_bits] but potentially with some bits
1442                 * chopped off, so in reality the index may be just a
1443                 * subprefix, padded with zero at the end.
1444                 * We can also take a look at any skipped bits in this
1445                 * tnode - everything up to p_pos is supposed to be ok,
1446                 * and the non-chopped bits of the index (se previous
1447                 * paragraph) are also guaranteed ok, but the rest is
1448                 * considered unknown.
1449                 *
1450                 * The skipped bits are key[pos+bits..cn->pos].
1451                 */
1452
1453                /* If current_prefix_length < pos+bits, we are already doing
1454                 * actual prefix  matching, which means everything from
1455                 * pos+(bits-chopped_off) onward must be zero along some
1456                 * branch of this subtree - otherwise there is *no* valid
1457                 * prefix present. Here we can only check the skipped
1458                 * bits. Remember, since we have already indexed into the
1459                 * parent's child array, we know that the bits we chopped of
1460                 * *are* zero.
1461                 */
1462
1463                /* NOTA BENE: Checking only skipped bits
1464                   for the new node here */
1465
1466                if (current_prefix_length < pos+bits) {
1467                        if (tkey_extract_bits(cn->key, current_prefix_length,
1468                                                cn->pos - current_prefix_length)
1469                            || !(cn->child[0]))
1470                                goto backtrace;
1471                }
1472
1473                /*
1474                 * If chopped_off=0, the index is fully validated and we
1475                 * only need to look at the skipped bits for this, the new,
1476                 * tnode. What we actually want to do is to find out if
1477                 * these skipped bits match our key perfectly, or if we will
1478                 * have to count on finding a matching prefix further down,
1479                 * because if we do, we would like to have some way of
1480                 * verifying the existence of such a prefix at this point.
1481                 */
1482
1483                /* The only thing we can do at this point is to verify that
1484                 * any such matching prefix can indeed be a prefix to our
1485                 * key, and if the bits in the node we are inspecting that
1486                 * do not match our key are not ZERO, this cannot be true.
1487                 * Thus, find out where there is a mismatch (before cn->pos)
1488                 * and verify that all the mismatching bits are zero in the
1489                 * new tnode's key.
1490                 */
1491
1492                /*
1493                 * Note: We aren't very concerned about the piece of
1494                 * the key that precede pn->pos+pn->bits, since these
1495                 * have already been checked. The bits after cn->pos
1496                 * aren't checked since these are by definition
1497                 * "unknown" at this point. Thus, what we want to see
1498                 * is if we are about to enter the "prefix matching"
1499                 * state, and in that case verify that the skipped
1500                 * bits that will prevail throughout this subtree are
1501                 * zero, as they have to be if we are to find a
1502                 * matching prefix.
1503                 */
1504
1505                node_prefix = mask_pfx(cn->key, cn->pos);
1506                key_prefix = mask_pfx(key, cn->pos);
1507                pref_mismatch = key_prefix^node_prefix;
1508                mp = 0;
1509
1510                /*
1511                 * In short: If skipped bits in this node do not match
1512                 * the search key, enter the "prefix matching"
1513                 * state.directly.
1514                 */
1515                if (pref_mismatch) {
1516                        while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1517                                mp++;
1518                                pref_mismatch = pref_mismatch << 1;
1519                        }
1520                        key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1521
1522                        if (key_prefix != 0)
1523                                goto backtrace;
1524
1525                        if (current_prefix_length >= cn->pos)
1526                                current_prefix_length = mp;
1527                }
1528
1529                pn = (struct tnode *)n; /* Descend */
1530                chopped_off = 0;
1531                continue;
1532
1533backtrace:
1534                chopped_off++;
1535
1536                /* As zero don't change the child key (cindex) */
1537                while ((chopped_off <= pn->bits)
1538                       && !(cindex & (1<<(chopped_off-1))))
1539                        chopped_off++;
1540
1541                /* Decrease current_... with bits chopped off */
1542                if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1543                        current_prefix_length = pn->pos + pn->bits
1544                                - chopped_off;
1545
1546                /*
1547                 * Either we do the actual chop off according or if we have
1548                 * chopped off all bits in this tnode walk up to our parent.
1549                 */
1550
1551                if (chopped_off <= pn->bits) {
1552                        cindex &= ~(1 << (chopped_off-1));
1553                } else {
1554                        struct tnode *parent = node_parent_rcu((struct node *) pn);
1555                        if (!parent)
1556                                goto failed;
1557
1558                        /* Get Child's index */
1559                        cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1560                        pn = parent;
1561                        chopped_off = 0;
1562
1563#ifdef CONFIG_IP_FIB_TRIE_STATS
1564                        t->stats.backtrack++;
1565#endif
1566                        goto backtrace;
1567                }
1568        }
1569failed:
1570        ret = 1;
1571found:
1572        rcu_read_unlock();
1573        return ret;
1574}
1575
1576/*
1577 * Remove the leaf and return parent.
1578 */
1579static void trie_leaf_remove(struct trie *t, struct leaf *l)
1580{
1581        struct tnode *tp = node_parent((struct node *) l);
1582
1583        pr_debug("entering trie_leaf_remove(%p)\n", l);
1584
1585        if (tp) {
1586                t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1587                put_child(t, (struct tnode *)tp, cindex, NULL);
1588                trie_rebalance(t, tp);
1589        } else
1590                rcu_assign_pointer(t->trie, NULL);
1591
1592        free_leaf(l);
1593}
1594
1595/*
1596 * Caller must hold RTNL.
1597 */
1598static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1599{
1600        struct trie *t = (struct trie *) tb->tb_data;
1601        u32 key, mask;
1602        int plen = cfg->fc_dst_len;
1603        u8 tos = cfg->fc_tos;
1604        struct fib_alias *fa, *fa_to_delete;
1605        struct list_head *fa_head;
1606        struct leaf *l;
1607        struct leaf_info *li;
1608
1609        if (plen > 32)
1610                return -EINVAL;
1611
1612        key = ntohl(cfg->fc_dst);
1613        mask = ntohl(inet_make_mask(plen));
1614
1615        if (key & ~mask)
1616                return -EINVAL;
1617
1618        key = key & mask;
1619        l = fib_find_node(t, key);
1620
1621        if (!l)
1622                return -ESRCH;
1623
1624        fa_head = get_fa_head(l, plen);
1625        fa = fib_find_alias(fa_head, tos, 0);
1626
1627        if (!fa)
1628                return -ESRCH;
1629
1630        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1631
1632        fa_to_delete = NULL;
1633        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1634        list_for_each_entry_continue(fa, fa_head, fa_list) {
1635                struct fib_info *fi = fa->fa_info;
1636
1637                if (fa->fa_tos != tos)
1638                        break;
1639
1640                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1641                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1642                     fa->fa_scope == cfg->fc_scope) &&
1643                    (!cfg->fc_protocol ||
1644                     fi->fib_protocol == cfg->fc_protocol) &&
1645                    fib_nh_match(cfg, fi) == 0) {
1646                        fa_to_delete = fa;
1647                        break;
1648                }
1649        }
1650
1651        if (!fa_to_delete)
1652                return -ESRCH;
1653
1654        fa = fa_to_delete;
1655        rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1656                  &cfg->fc_nlinfo, 0);
1657
1658        l = fib_find_node(t, key);
1659        li = find_leaf_info(l, plen);
1660
1661        list_del_rcu(&fa->fa_list);
1662
1663        if (list_empty(fa_head)) {
1664                hlist_del_rcu(&li->hlist);
1665                free_leaf_info(li);
1666        }
1667
1668        if (hlist_empty(&l->list))
1669                trie_leaf_remove(t, l);
1670
1671        if (fa->fa_state & FA_S_ACCESSED)
1672                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1673
1674        fib_release_info(fa->fa_info);
1675        alias_free_mem_rcu(fa);
1676        return 0;
1677}
1678
1679static int trie_flush_list(struct list_head *head)
1680{
1681        struct fib_alias *fa, *fa_node;
1682        int found = 0;
1683
1684        list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1685                struct fib_info *fi = fa->fa_info;
1686
1687                if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1688                        list_del_rcu(&fa->fa_list);
1689                        fib_release_info(fa->fa_info);
1690                        alias_free_mem_rcu(fa);
1691                        found++;
1692                }
1693        }
1694        return found;
1695}
1696
1697static int trie_flush_leaf(struct leaf *l)
1698{
1699        int found = 0;
1700        struct hlist_head *lih = &l->list;
1701        struct hlist_node *node, *tmp;
1702        struct leaf_info *li = NULL;
1703
1704        hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1705                found += trie_flush_list(&li->falh);
1706
1707                if (list_empty(&li->falh)) {
1708                        hlist_del_rcu(&li->hlist);
1709                        free_leaf_info(li);
1710                }
1711        }
1712        return found;
1713}
1714
1715/*
1716 * Scan for the next right leaf starting at node p->child[idx]
1717 * Since we have back pointer, no recursion necessary.
1718 */
1719static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1720{
1721        do {
1722                t_key idx;
1723
1724                if (c)
1725                        idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1726                else
1727                        idx = 0;
1728
1729                while (idx < 1u << p->bits) {
1730                        c = tnode_get_child_rcu(p, idx++);
1731                        if (!c)
1732                                continue;
1733
1734                        if (IS_LEAF(c)) {
1735                                prefetch(p->child[idx]);
1736                                return (struct leaf *) c;
1737                        }
1738
1739                        /* Rescan start scanning in new node */
1740                        p = (struct tnode *) c;
1741                        idx = 0;
1742                }
1743
1744                /* Node empty, walk back up to parent */
1745                c = (struct node *) p;
1746        } while ( (p = node_parent_rcu(c)) != NULL);
1747
1748        return NULL; /* Root of trie */
1749}
1750
1751static struct leaf *trie_firstleaf(struct trie *t)
1752{
1753        struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1754
1755        if (!n)
1756                return NULL;
1757
1758        if (IS_LEAF(n))          /* trie is just a leaf */
1759                return (struct leaf *) n;
1760
1761        return leaf_walk_rcu(n, NULL);
1762}
1763
1764static struct leaf *trie_nextleaf(struct leaf *l)
1765{
1766        struct node *c = (struct node *) l;
1767        struct tnode *p = node_parent_rcu(c);
1768
1769        if (!p)
1770                return NULL;    /* trie with just one leaf */
1771
1772        return leaf_walk_rcu(p, c);
1773}
1774
1775static struct leaf *trie_leafindex(struct trie *t, int index)
1776{
1777        struct leaf *l = trie_firstleaf(t);
1778
1779        while (l && index-- > 0)
1780                l = trie_nextleaf(l);
1781
1782        return l;
1783}
1784
1785
1786/*
1787 * Caller must hold RTNL.
1788 */
1789static int fn_trie_flush(struct fib_table *tb)
1790{
1791        struct trie *t = (struct trie *) tb->tb_data;
1792        struct leaf *l, *ll = NULL;
1793        int found = 0;
1794
1795        for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1796                found += trie_flush_leaf(l);
1797
1798                if (ll && hlist_empty(&ll->list))
1799                        trie_leaf_remove(t, ll);
1800                ll = l;
1801        }
1802
1803        if (ll && hlist_empty(&ll->list))
1804                trie_leaf_remove(t, ll);
1805
1806        pr_debug("trie_flush found=%d\n", found);
1807        return found;
1808}
1809
1810static void fn_trie_select_default(struct fib_table *tb,
1811                                   const struct flowi *flp,
1812                                   struct fib_result *res)
1813{
1814        struct trie *t = (struct trie *) tb->tb_data;
1815        int order, last_idx;
1816        struct fib_info *fi = NULL;
1817        struct fib_info *last_resort;
1818        struct fib_alias *fa = NULL;
1819        struct list_head *fa_head;
1820        struct leaf *l;
1821
1822        last_idx = -1;
1823        last_resort = NULL;
1824        order = -1;
1825
1826        rcu_read_lock();
1827
1828        l = fib_find_node(t, 0);
1829        if (!l)
1830                goto out;
1831
1832        fa_head = get_fa_head(l, 0);
1833        if (!fa_head)
1834                goto out;
1835
1836        if (list_empty(fa_head))
1837                goto out;
1838
1839        list_for_each_entry_rcu(fa, fa_head, fa_list) {
1840                struct fib_info *next_fi = fa->fa_info;
1841
1842                if (fa->fa_scope != res->scope ||
1843                    fa->fa_type != RTN_UNICAST)
1844                        continue;
1845
1846                if (next_fi->fib_priority > res->fi->fib_priority)
1847                        break;
1848                if (!next_fi->fib_nh[0].nh_gw ||
1849                    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1850                        continue;
1851                fa->fa_state |= FA_S_ACCESSED;
1852
1853                if (fi == NULL) {
1854                        if (next_fi != res->fi)
1855                                break;
1856                } else if (!fib_detect_death(fi, order, &last_resort,
1857                                             &last_idx, tb->tb_default)) {
1858                        fib_result_assign(res, fi);
1859                        tb->tb_default = order;
1860                        goto out;
1861                }
1862                fi = next_fi;
1863                order++;
1864        }
1865        if (order <= 0 || fi == NULL) {
1866                tb->tb_default = -1;
1867                goto out;
1868        }
1869
1870        if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1871                                tb->tb_default)) {
1872                fib_result_assign(res, fi);
1873                tb->tb_default = order;
1874                goto out;
1875        }
1876        if (last_idx >= 0)
1877                fib_result_assign(res, last_resort);
1878        tb->tb_default = last_idx;
1879out:
1880        rcu_read_unlock();
1881}
1882
1883static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1884                           struct fib_table *tb,
1885                           struct sk_buff *skb, struct netlink_callback *cb)
1886{
1887        int i, s_i;
1888        struct fib_alias *fa;
1889        __be32 xkey = htonl(key);
1890
1891        s_i = cb->args[5];
1892        i = 0;
1893
1894        /* rcu_read_lock is hold by caller */
1895
1896        list_for_each_entry_rcu(fa, fah, fa_list) {
1897                if (i < s_i) {
1898                        i++;
1899                        continue;
1900                }
1901
1902                if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1903                                  cb->nlh->nlmsg_seq,
1904                                  RTM_NEWROUTE,
1905                                  tb->tb_id,
1906                                  fa->fa_type,
1907                                  fa->fa_scope,
1908                                  xkey,
1909                                  plen,
1910                                  fa->fa_tos,
1911                                  fa->fa_info, NLM_F_MULTI) < 0) {
1912                        cb->args[5] = i;
1913                        return -1;
1914                }
1915                i++;
1916        }
1917        cb->args[5] = i;
1918        return skb->len;
1919}
1920
1921static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1922                        struct sk_buff *skb, struct netlink_callback *cb)
1923{
1924        struct leaf_info *li;
1925        struct hlist_node *node;
1926        int i, s_i;
1927
1928        s_i = cb->args[4];
1929        i = 0;
1930
1931        /* rcu_read_lock is hold by caller */
1932        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1933                if (i < s_i) {
1934                        i++;
1935                        continue;
1936                }
1937
1938                if (i > s_i)
1939                        cb->args[5] = 0;
1940
1941                if (list_empty(&li->falh))
1942                        continue;
1943
1944                if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1945                        cb->args[4] = i;
1946                        return -1;
1947                }
1948                i++;
1949        }
1950
1951        cb->args[4] = i;
1952        return skb->len;
1953}
1954
1955static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1956                        struct netlink_callback *cb)
1957{
1958        struct leaf *l;
1959        struct trie *t = (struct trie *) tb->tb_data;
1960        t_key key = cb->args[2];
1961        int count = cb->args[3];
1962
1963        rcu_read_lock();
1964        /* Dump starting at last key.
1965         * Note: 0.0.0.0/0 (ie default) is first key.
1966         */
1967        if (count == 0)
1968                l = trie_firstleaf(t);
1969        else {
1970                /* Normally, continue from last key, but if that is missing
1971                 * fallback to using slow rescan
1972                 */
1973                l = fib_find_node(t, key);
1974                if (!l)
1975                        l = trie_leafindex(t, count);
1976        }
1977
1978        while (l) {
1979                cb->args[2] = l->key;
1980                if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1981                        cb->args[3] = count;
1982                        rcu_read_unlock();
1983                        return -1;
1984                }
1985
1986                ++count;
1987                l = trie_nextleaf(l);
1988                memset(&cb->args[4], 0,
1989                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1990        }
1991        cb->args[3] = count;
1992        rcu_read_unlock();
1993
1994        return skb->len;
1995}
1996
1997void __init fib_hash_init(void)
1998{
1999        fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2000                                          sizeof(struct fib_alias),
2001                                          0, SLAB_PANIC, NULL);
2002
2003        trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2004                                           max(sizeof(struct leaf),
2005                                               sizeof(struct leaf_info)),
2006                                           0, SLAB_PANIC, NULL);
2007}
2008
2009
2010/* Fix more generic FIB names for init later */
2011struct fib_table *fib_hash_table(u32 id)
2012{
2013        struct fib_table *tb;
2014        struct trie *t;
2015
2016        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2017                     GFP_KERNEL);
2018        if (tb == NULL)
2019                return NULL;
2020
2021        tb->tb_id = id;
2022        tb->tb_default = -1;
2023        tb->tb_lookup = fn_trie_lookup;
2024        tb->tb_insert = fn_trie_insert;
2025        tb->tb_delete = fn_trie_delete;
2026        tb->tb_flush = fn_trie_flush;
2027        tb->tb_select_default = fn_trie_select_default;
2028        tb->tb_dump = fn_trie_dump;
2029
2030        t = (struct trie *) tb->tb_data;
2031        memset(t, 0, sizeof(*t));
2032
2033        if (id == RT_TABLE_LOCAL)
2034                pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2035
2036        return tb;
2037}
2038
2039#ifdef CONFIG_PROC_FS
2040/* Depth first Trie walk iterator */
2041struct fib_trie_iter {
2042        struct seq_net_private p;
2043        struct fib_table *tb;
2044        struct tnode *tnode;
2045        unsigned index;
2046        unsigned depth;
2047};
2048
2049static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2050{
2051        struct tnode *tn = iter->tnode;
2052        unsigned cindex = iter->index;
2053        struct tnode *p;
2054
2055        /* A single entry routing table */
2056        if (!tn)
2057                return NULL;
2058
2059        pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2060                 iter->tnode, iter->index, iter->depth);
2061rescan:
2062        while (cindex < (1<<tn->bits)) {
2063                struct node *n = tnode_get_child_rcu(tn, cindex);
2064
2065                if (n) {
2066                        if (IS_LEAF(n)) {
2067                                iter->tnode = tn;
2068                                iter->index = cindex + 1;
2069                        } else {
2070                                /* push down one level */
2071                                iter->tnode = (struct tnode *) n;
2072                                iter->index = 0;
2073                                ++iter->depth;
2074                        }
2075                        return n;
2076                }
2077
2078                ++cindex;
2079        }
2080
2081        /* Current node exhausted, pop back up */
2082        p = node_parent_rcu((struct node *)tn);
2083        if (p) {
2084                cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2085                tn = p;
2086                --iter->depth;
2087                goto rescan;
2088        }
2089
2090        /* got root? */
2091        return NULL;
2092}
2093
2094static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2095                                       struct trie *t)
2096{
2097        struct node *n;
2098
2099        if (!t)
2100                return NULL;
2101
2102        n = rcu_dereference(t->trie);
2103        if (!n)
2104                return NULL;
2105
2106        if (IS_TNODE(n)) {
2107                iter->tnode = (struct tnode *) n;
2108                iter->index = 0;
2109                iter->depth = 1;
2110        } else {
2111                iter->tnode = NULL;
2112                iter->index = 0;
2113                iter->depth = 0;
2114        }
2115
2116        return n;
2117}
2118
2119static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2120{
2121        struct node *n;
2122        struct fib_trie_iter iter;
2123
2124        memset(s, 0, sizeof(*s));
2125
2126        rcu_read_lock();
2127        for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2128                if (IS_LEAF(n)) {
2129                        struct leaf *l = (struct leaf *)n;
2130                        struct leaf_info *li;
2131                        struct hlist_node *tmp;
2132
2133                        s->leaves++;
2134                        s->totdepth += iter.depth;
2135                        if (iter.depth > s->maxdepth)
2136                                s->maxdepth = iter.depth;
2137
2138                        hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2139                                ++s->prefixes;
2140                } else {
2141                        const struct tnode *tn = (const struct tnode *) n;
2142                        int i;
2143
2144                        s->tnodes++;
2145                        if (tn->bits < MAX_STAT_DEPTH)
2146                                s->nodesizes[tn->bits]++;
2147
2148                        for (i = 0; i < (1<<tn->bits); i++)
2149                                if (!tn->child[i])
2150                                        s->nullpointers++;
2151                }
2152        }
2153        rcu_read_unlock();
2154}
2155
2156/*
2157 *      This outputs /proc/net/fib_triestats
2158 */
2159static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2160{
2161        unsigned i, max, pointers, bytes, avdepth;
2162
2163        if (stat->leaves)
2164                avdepth = stat->totdepth*100 / stat->leaves;
2165        else
2166                avdepth = 0;
2167
2168        seq_printf(seq, "\tAver depth:     %u.%02d\n",
2169                   avdepth / 100, avdepth % 100);
2170        seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2171
2172        seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2173        bytes = sizeof(struct leaf) * stat->leaves;
2174
2175        seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2176        bytes += sizeof(struct leaf_info) * stat->prefixes;
2177
2178        seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2179        bytes += sizeof(struct tnode) * stat->tnodes;
2180
2181        max = MAX_STAT_DEPTH;
2182        while (max > 0 && stat->nodesizes[max-1] == 0)
2183                max--;
2184
2185        pointers = 0;
2186        for (i = 1; i <= max; i++)
2187                if (stat->nodesizes[i] != 0) {
2188                        seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2189                        pointers += (1<<i) * stat->nodesizes[i];
2190                }
2191        seq_putc(seq, '\n');
2192        seq_printf(seq, "\tPointers: %u\n", pointers);
2193
2194        bytes += sizeof(struct node *) * pointers;
2195        seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2196        seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2197}
2198
2199#ifdef CONFIG_IP_FIB_TRIE_STATS
2200static void trie_show_usage(struct seq_file *seq,
2201                            const struct trie_use_stats *stats)
2202{
2203        seq_printf(seq, "\nCounters:\n---------\n");
2204        seq_printf(seq, "gets = %u\n", stats->gets);
2205        seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2206        seq_printf(seq, "semantic match passed = %u\n",
2207                   stats->semantic_match_passed);
2208        seq_printf(seq, "semantic match miss = %u\n",
2209                   stats->semantic_match_miss);
2210        seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2211        seq_printf(seq, "skipped node resize = %u\n\n",
2212                   stats->resize_node_skipped);
2213}
2214#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2215
2216static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2217{
2218        if (tb->tb_id == RT_TABLE_LOCAL)
2219                seq_puts(seq, "Local:\n");
2220        else if (tb->tb_id == RT_TABLE_MAIN)
2221                seq_puts(seq, "Main:\n");
2222        else
2223                seq_printf(seq, "Id %d:\n", tb->tb_id);
2224}
2225
2226
2227static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2228{
2229        struct net *net = (struct net *)seq->private;
2230        unsigned int h;
2231
2232        seq_printf(seq,
2233                   "Basic info: size of leaf:"
2234                   " %Zd bytes, size of tnode: %Zd bytes.\n",
2235                   sizeof(struct leaf), sizeof(struct tnode));
2236
2237        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2238                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2239                struct hlist_node *node;
2240                struct fib_table *tb;
2241
2242                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2243                        struct trie *t = (struct trie *) tb->tb_data;
2244                        struct trie_stat stat;
2245
2246                        if (!t)
2247                                continue;
2248
2249                        fib_table_print(seq, tb);
2250
2251                        trie_collect_stats(t, &stat);
2252                        trie_show_stats(seq, &stat);
2253#ifdef CONFIG_IP_FIB_TRIE_STATS
2254                        trie_show_usage(seq, &t->stats);
2255#endif
2256                }
2257        }
2258
2259        return 0;
2260}
2261
2262static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2263{
2264        return single_open_net(inode, file, fib_triestat_seq_show);
2265}
2266
2267static const struct file_operations fib_triestat_fops = {
2268        .owner  = THIS_MODULE,
2269        .open   = fib_triestat_seq_open,
2270        .read   = seq_read,
2271        .llseek = seq_lseek,
2272        .release = single_release_net,
2273};
2274
2275static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2276{
2277        struct fib_trie_iter *iter = seq->private;
2278        struct net *net = seq_file_net(seq);
2279        loff_t idx = 0;
2280        unsigned int h;
2281
2282        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2283                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2284                struct hlist_node *node;
2285                struct fib_table *tb;
2286
2287                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2288                        struct node *n;
2289
2290                        for (n = fib_trie_get_first(iter,
2291                                                    (struct trie *) tb->tb_data);
2292                             n; n = fib_trie_get_next(iter))
2293                                if (pos == idx++) {
2294                                        iter->tb = tb;
2295                                        return n;
2296                                }
2297                }
2298        }
2299
2300        return NULL;
2301}
2302
2303static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2304        __acquires(RCU)
2305{
2306        rcu_read_lock();
2307        return fib_trie_get_idx(seq, *pos);
2308}
2309
2310static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2311{
2312        struct fib_trie_iter *iter = seq->private;
2313        struct net *net = seq_file_net(seq);
2314        struct fib_table *tb = iter->tb;
2315        struct hlist_node *tb_node;
2316        unsigned int h;
2317        struct node *n;
2318
2319        ++*pos;
2320        /* next node in same table */
2321        n = fib_trie_get_next(iter);
2322        if (n)
2323                return n;
2324
2325        /* walk rest of this hash chain */
2326        h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2327        while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2328                tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2329                n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2330                if (n)
2331                        goto found;
2332        }
2333
2334        /* new hash chain */
2335        while (++h < FIB_TABLE_HASHSZ) {
2336                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2337                hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2338                        n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2339                        if (n)
2340                                goto found;
2341                }
2342        }
2343        return NULL;
2344
2345found:
2346        iter->tb = tb;
2347        return n;
2348}
2349
2350static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2351        __releases(RCU)
2352{
2353        rcu_read_unlock();
2354}
2355
2356static void seq_indent(struct seq_file *seq, int n)
2357{
2358        while (n-- > 0) seq_puts(seq, "   ");
2359}
2360
2361static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2362{
2363        switch (s) {
2364        case RT_SCOPE_UNIVERSE: return "universe";
2365        case RT_SCOPE_SITE:     return "site";
2366        case RT_SCOPE_LINK:     return "link";
2367        case RT_SCOPE_HOST:     return "host";
2368        case RT_SCOPE_NOWHERE:  return "nowhere";
2369        default:
2370                snprintf(buf, len, "scope=%d", s);
2371                return buf;
2372        }
2373}
2374
2375static const char *const rtn_type_names[__RTN_MAX] = {
2376        [RTN_UNSPEC] = "UNSPEC",
2377        [RTN_UNICAST] = "UNICAST",
2378        [RTN_LOCAL] = "LOCAL",
2379        [RTN_BROADCAST] = "BROADCAST",
2380        [RTN_ANYCAST] = "ANYCAST",
2381        [RTN_MULTICAST] = "MULTICAST",
2382        [RTN_BLACKHOLE] = "BLACKHOLE",
2383        [RTN_UNREACHABLE] = "UNREACHABLE",
2384        [RTN_PROHIBIT] = "PROHIBIT",
2385        [RTN_THROW] = "THROW",
2386        [RTN_NAT] = "NAT",
2387        [RTN_XRESOLVE] = "XRESOLVE",
2388};
2389
2390static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2391{
2392        if (t < __RTN_MAX && rtn_type_names[t])
2393                return rtn_type_names[t];
2394        snprintf(buf, len, "type %u", t);
2395        return buf;
2396}
2397
2398/* Pretty print the trie */
2399static int fib_trie_seq_show(struct seq_file *seq, void *v)
2400{
2401        const struct fib_trie_iter *iter = seq->private;
2402        struct node *n = v;
2403
2404        if (!node_parent_rcu(n))
2405                fib_table_print(seq, iter->tb);
2406
2407        if (IS_TNODE(n)) {
2408                struct tnode *tn = (struct tnode *) n;
2409                __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2410
2411                seq_indent(seq, iter->depth-1);
2412                seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2413                           &prf, tn->pos, tn->bits, tn->full_children,
2414                           tn->empty_children);
2415
2416        } else {
2417                struct leaf *l = (struct leaf *) n;
2418                struct leaf_info *li;
2419                struct hlist_node *node;
2420                __be32 val = htonl(l->key);
2421
2422                seq_indent(seq, iter->depth);
2423                seq_printf(seq, "  |-- %pI4\n", &val);
2424
2425                hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2426                        struct fib_alias *fa;
2427
2428                        list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2429                                char buf1[32], buf2[32];
2430
2431                                seq_indent(seq, iter->depth+1);
2432                                seq_printf(seq, "  /%d %s %s", li->plen,
2433                                           rtn_scope(buf1, sizeof(buf1),
2434                                                     fa->fa_scope),
2435                                           rtn_type(buf2, sizeof(buf2),
2436                                                    fa->fa_type));
2437                                if (fa->fa_tos)
2438                                        seq_printf(seq, " tos=%d", fa->fa_tos);
2439                                seq_putc(seq, '\n');
2440                        }
2441                }
2442        }
2443
2444        return 0;
2445}
2446
2447static const struct seq_operations fib_trie_seq_ops = {
2448        .start  = fib_trie_seq_start,
2449        .next   = fib_trie_seq_next,
2450        .stop   = fib_trie_seq_stop,
2451        .show   = fib_trie_seq_show,
2452};
2453
2454static int fib_trie_seq_open(struct inode *inode, struct file *file)
2455{
2456        return seq_open_net(inode, file, &fib_trie_seq_ops,
2457                            sizeof(struct fib_trie_iter));
2458}
2459
2460static const struct file_operations fib_trie_fops = {
2461        .owner  = THIS_MODULE,
2462        .open   = fib_trie_seq_open,
2463        .read   = seq_read,
2464        .llseek = seq_lseek,
2465        .release = seq_release_net,
2466};
2467
2468struct fib_route_iter {
2469        struct seq_net_private p;
2470        struct trie *main_trie;
2471        loff_t  pos;
2472        t_key   key;
2473};
2474
2475static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2476{
2477        struct leaf *l = NULL;
2478        struct trie *t = iter->main_trie;
2479
2480        /* use cache location of last found key */
2481        if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2482                pos -= iter->pos;
2483        else {
2484                iter->pos = 0;
2485                l = trie_firstleaf(t);
2486        }
2487
2488        while (l && pos-- > 0) {
2489                iter->pos++;
2490                l = trie_nextleaf(l);
2491        }
2492
2493        if (l)
2494                iter->key = pos;        /* remember it */
2495        else
2496                iter->pos = 0;          /* forget it */
2497
2498        return l;
2499}
2500
2501static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2502        __acquires(RCU)
2503{
2504        struct fib_route_iter *iter = seq->private;
2505        struct fib_table *tb;
2506
2507        rcu_read_lock();
2508        tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2509        if (!tb)
2510                return NULL;
2511
2512        iter->main_trie = (struct trie *) tb->tb_data;
2513        if (*pos == 0)
2514                return SEQ_START_TOKEN;
2515        else
2516                return fib_route_get_idx(iter, *pos - 1);
2517}
2518
2519static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2520{
2521        struct fib_route_iter *iter = seq->private;
2522        struct leaf *l = v;
2523
2524        ++*pos;
2525        if (v == SEQ_START_TOKEN) {
2526                iter->pos = 0;
2527                l = trie_firstleaf(iter->main_trie);
2528        } else {
2529                iter->pos++;
2530                l = trie_nextleaf(l);
2531        }
2532
2533        if (l)
2534                iter->key = l->key;
2535        else
2536                iter->pos = 0;
2537        return l;
2538}
2539
2540static void fib_route_seq_stop(struct seq_file *seq, void *v)
2541        __releases(RCU)
2542{
2543        rcu_read_unlock();
2544}
2545
2546static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2547{
2548        static unsigned type2flags[RTN_MAX + 1] = {
2549                [7] = RTF_REJECT, [8] = RTF_REJECT,
2550        };
2551        unsigned flags = type2flags[type];
2552
2553        if (fi && fi->fib_nh->nh_gw)
2554                flags |= RTF_GATEWAY;
2555        if (mask == htonl(0xFFFFFFFF))
2556                flags |= RTF_HOST;
2557        flags |= RTF_UP;
2558        return flags;
2559}
2560
2561/*
2562 *      This outputs /proc/net/route.
2563 *      The format of the file is not supposed to be changed
2564 *      and needs to be same as fib_hash output to avoid breaking
2565 *      legacy utilities
2566 */
2567static int fib_route_seq_show(struct seq_file *seq, void *v)
2568{
2569        struct leaf *l = v;
2570        struct leaf_info *li;
2571        struct hlist_node *node;
2572
2573        if (v == SEQ_START_TOKEN) {
2574                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2575                           "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2576                           "\tWindow\tIRTT");
2577                return 0;
2578        }
2579
2580        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2581                struct fib_alias *fa;
2582                __be32 mask, prefix;
2583
2584                mask = inet_make_mask(li->plen);
2585                prefix = htonl(l->key);
2586
2587                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2588                        const struct fib_info *fi = fa->fa_info;
2589                        unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2590                        int len;
2591
2592                        if (fa->fa_type == RTN_BROADCAST
2593                            || fa->fa_type == RTN_MULTICAST)
2594                                continue;
2595
2596                        if (fi)
2597                                seq_printf(seq,
2598                                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2599                                         "%d\t%08X\t%d\t%u\t%u%n",
2600                                         fi->fib_dev ? fi->fib_dev->name : "*",
2601                                         prefix,
2602                                         fi->fib_nh->nh_gw, flags, 0, 0,
2603                                         fi->fib_priority,
2604                                         mask,
2605                                         (fi->fib_advmss ?
2606                                          fi->fib_advmss + 40 : 0),
2607                                         fi->fib_window,
2608                                         fi->fib_rtt >> 3, &len);
2609                        else
2610                                seq_printf(seq,
2611                                         "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2612                                         "%d\t%08X\t%d\t%u\t%u%n",
2613                                         prefix, 0, flags, 0, 0, 0,
2614                                         mask, 0, 0, 0, &len);
2615
2616                        seq_printf(seq, "%*s\n", 127 - len, "");
2617                }
2618        }
2619
2620        return 0;
2621}
2622
2623static const struct seq_operations fib_route_seq_ops = {
2624        .start  = fib_route_seq_start,
2625        .next   = fib_route_seq_next,
2626        .stop   = fib_route_seq_stop,
2627        .show   = fib_route_seq_show,
2628};
2629
2630static int fib_route_seq_open(struct inode *inode, struct file *file)
2631{
2632        return seq_open_net(inode, file, &fib_route_seq_ops,
2633                            sizeof(struct fib_route_iter));
2634}
2635
2636static const struct file_operations fib_route_fops = {
2637        .owner  = THIS_MODULE,
2638        .open   = fib_route_seq_open,
2639        .read   = seq_read,
2640        .llseek = seq_lseek,
2641        .release = seq_release_net,
2642};
2643
2644int __net_init fib_proc_init(struct net *net)
2645{
2646        if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2647                goto out1;
2648
2649        if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2650                                  &fib_triestat_fops))
2651                goto out2;
2652
2653        if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2654                goto out3;
2655
2656        return 0;
2657
2658out3:
2659        proc_net_remove(net, "fib_triestat");
2660out2:
2661        proc_net_remove(net, "fib_trie");
2662out1:
2663        return -ENOMEM;
2664}
2665
2666void __net_exit fib_proc_exit(struct net *net)
2667{
2668        proc_net_remove(net, "fib_trie");
2669        proc_net_remove(net, "fib_triestat");
2670        proc_net_remove(net, "route");
2671}
2672
2673#endif /* CONFIG_PROC_FS */
2674
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.