linux/net/ipv4/fib_trie.c
<<
>>
Prefs
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally descibed in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET         An implementation of the TCP/IP protocol suite for the LINUX
  30 *              operating system.  INET is implemented using the  BSD Socket
  31 *              interface as the means of communication with the user level.
  32 *
  33 *              IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *              This program is free software; you can redistribute it and/or
  39 *              modify it under the terms of the GNU General Public License
  40 *              as published by the Free Software Foundation; either version
  41 *              2 of the License, or (at your option) any later version.
  42 *
  43 * Substantial contributions to this work comes from:
  44 *
  45 *              David S. Miller, <davem@davemloft.net>
  46 *              Stephen Hemminger <shemminger@osdl.org>
  47 *              Paul E. McKenney <paulmck@us.ibm.com>
  48 *              Patrick McHardy <kaber@trash.net>
  49 */
  50
  51#define VERSION "0.409"
  52
  53#include <asm/uaccess.h>
  54#include <asm/system.h>
  55#include <linux/bitops.h>
  56#include <linux/types.h>
  57#include <linux/kernel.h>
  58#include <linux/mm.h>
  59#include <linux/string.h>
  60#include <linux/socket.h>
  61#include <linux/sockios.h>
  62#include <linux/errno.h>
  63#include <linux/in.h>
  64#include <linux/inet.h>
  65#include <linux/inetdevice.h>
  66#include <linux/netdevice.h>
  67#include <linux/if_arp.h>
  68#include <linux/proc_fs.h>
  69#include <linux/rcupdate.h>
  70#include <linux/skbuff.h>
  71#include <linux/netlink.h>
  72#include <linux/init.h>
  73#include <linux/list.h>
  74#include <net/net_namespace.h>
  75#include <net/ip.h>
  76#include <net/protocol.h>
  77#include <net/route.h>
  78#include <net/tcp.h>
  79#include <net/sock.h>
  80#include <net/ip_fib.h>
  81#include "fib_lookup.h"
  82
  83#define MAX_STAT_DEPTH 32
  84
  85#define KEYLENGTH (8*sizeof(t_key))
  86
  87typedef unsigned int t_key;
  88
  89#define T_TNODE 0
  90#define T_LEAF  1
  91#define NODE_TYPE_MASK  0x1UL
  92#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  93
  94#define IS_TNODE(n) (!(n->parent & T_LEAF))
  95#define IS_LEAF(n) (n->parent & T_LEAF)
  96
  97struct node {
  98        unsigned long parent;
  99        t_key key;
 100};
 101
 102struct leaf {
 103        unsigned long parent;
 104        t_key key;
 105        struct hlist_head list;
 106        struct rcu_head rcu;
 107};
 108
 109struct leaf_info {
 110        struct hlist_node hlist;
 111        struct rcu_head rcu;
 112        int plen;
 113        struct list_head falh;
 114};
 115
 116struct tnode {
 117        unsigned long parent;
 118        t_key key;
 119        unsigned char pos;              /* 2log(KEYLENGTH) bits needed */
 120        unsigned char bits;             /* 2log(KEYLENGTH) bits needed */
 121        unsigned int full_children;     /* KEYLENGTH bits needed */
 122        unsigned int empty_children;    /* KEYLENGTH bits needed */
 123        union {
 124                struct rcu_head rcu;
 125                struct work_struct work;
 126                struct tnode *tnode_free;
 127        };
 128        struct node *child[0];
 129};
 130
 131#ifdef CONFIG_IP_FIB_TRIE_STATS
 132struct trie_use_stats {
 133        unsigned int gets;
 134        unsigned int backtrack;
 135        unsigned int semantic_match_passed;
 136        unsigned int semantic_match_miss;
 137        unsigned int null_node_hit;
 138        unsigned int resize_node_skipped;
 139};
 140#endif
 141
 142struct trie_stat {
 143        unsigned int totdepth;
 144        unsigned int maxdepth;
 145        unsigned int tnodes;
 146        unsigned int leaves;
 147        unsigned int nullpointers;
 148        unsigned int prefixes;
 149        unsigned int nodesizes[MAX_STAT_DEPTH];
 150};
 151
 152struct trie {
 153        struct node *trie;
 154#ifdef CONFIG_IP_FIB_TRIE_STATS
 155        struct trie_use_stats stats;
 156#endif
 157};
 158
 159static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 160static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 161                                  int wasfull);
 162static struct node *resize(struct trie *t, struct tnode *tn);
 163static struct tnode *inflate(struct trie *t, struct tnode *tn);
 164static struct tnode *halve(struct trie *t, struct tnode *tn);
 165/* tnodes to free after resize(); protected by RTNL */
 166static struct tnode *tnode_free_head;
 167static size_t tnode_free_size;
 168
 169/*
 170 * synchronize_rcu after call_rcu for that many pages; it should be especially
 171 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 172 * obtained experimentally, aiming to avoid visible slowdown.
 173 */
 174static const int sync_pages = 128;
 175
 176static struct kmem_cache *fn_alias_kmem __read_mostly;
 177static struct kmem_cache *trie_leaf_kmem __read_mostly;
 178
 179static inline struct tnode *node_parent(struct node *node)
 180{
 181        return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
 182}
 183
 184static inline struct tnode *node_parent_rcu(struct node *node)
 185{
 186        struct tnode *ret = node_parent(node);
 187
 188        return rcu_dereference(ret);
 189}
 190
 191/* Same as rcu_assign_pointer
 192 * but that macro() assumes that value is a pointer.
 193 */
 194static inline void node_set_parent(struct node *node, struct tnode *ptr)
 195{
 196        smp_wmb();
 197        node->parent = (unsigned long)ptr | NODE_TYPE(node);
 198}
 199
 200static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
 201{
 202        BUG_ON(i >= 1U << tn->bits);
 203
 204        return tn->child[i];
 205}
 206
 207static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
 208{
 209        struct node *ret = tnode_get_child(tn, i);
 210
 211        return rcu_dereference(ret);
 212}
 213
 214static inline int tnode_child_length(const struct tnode *tn)
 215{
 216        return 1 << tn->bits;
 217}
 218
 219static inline t_key mask_pfx(t_key k, unsigned short l)
 220{
 221        return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 222}
 223
 224static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
 225{
 226        if (offset < KEYLENGTH)
 227                return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 228        else
 229                return 0;
 230}
 231
 232static inline int tkey_equals(t_key a, t_key b)
 233{
 234        return a == b;
 235}
 236
 237static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 238{
 239        if (bits == 0 || offset >= KEYLENGTH)
 240                return 1;
 241        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 242        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 243}
 244
 245static inline int tkey_mismatch(t_key a, int offset, t_key b)
 246{
 247        t_key diff = a ^ b;
 248        int i = offset;
 249
 250        if (!diff)
 251                return 0;
 252        while ((diff << i) >> (KEYLENGTH-1) == 0)
 253                i++;
 254        return i;
 255}
 256
 257/*
 258  To understand this stuff, an understanding of keys and all their bits is
 259  necessary. Every node in the trie has a key associated with it, but not
 260  all of the bits in that key are significant.
 261
 262  Consider a node 'n' and its parent 'tp'.
 263
 264  If n is a leaf, every bit in its key is significant. Its presence is
 265  necessitated by path compression, since during a tree traversal (when
 266  searching for a leaf - unless we are doing an insertion) we will completely
 267  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 268  a potentially successful search, that we have indeed been walking the
 269  correct key path.
 270
 271  Note that we can never "miss" the correct key in the tree if present by
 272  following the wrong path. Path compression ensures that segments of the key
 273  that are the same for all keys with a given prefix are skipped, but the
 274  skipped part *is* identical for each node in the subtrie below the skipped
 275  bit! trie_insert() in this implementation takes care of that - note the
 276  call to tkey_sub_equals() in trie_insert().
 277
 278  if n is an internal node - a 'tnode' here, the various parts of its key
 279  have many different meanings.
 280
 281  Example:
 282  _________________________________________________________________
 283  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 284  -----------------------------------------------------------------
 285    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 286
 287  _________________________________________________________________
 288  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 289  -----------------------------------------------------------------
 290   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 291
 292  tp->pos = 7
 293  tp->bits = 3
 294  n->pos = 15
 295  n->bits = 4
 296
 297  First, let's just ignore the bits that come before the parent tp, that is
 298  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 299  not use them for anything.
 300
 301  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 302  index into the parent's child array. That is, they will be used to find
 303  'n' among tp's children.
 304
 305  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 306  for the node n.
 307
 308  All the bits we have seen so far are significant to the node n. The rest
 309  of the bits are really not needed or indeed known in n->key.
 310
 311  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 312  n's child array, and will of course be different for each child.
 313
 314
 315  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 316  at this point.
 317
 318*/
 319
 320static inline void check_tnode(const struct tnode *tn)
 321{
 322        WARN_ON(tn && tn->pos+tn->bits > 32);
 323}
 324
 325static const int halve_threshold = 25;
 326static const int inflate_threshold = 50;
 327static const int halve_threshold_root = 15;
 328static const int inflate_threshold_root = 30;
 329
 330static void __alias_free_mem(struct rcu_head *head)
 331{
 332        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 333        kmem_cache_free(fn_alias_kmem, fa);
 334}
 335
 336static inline void alias_free_mem_rcu(struct fib_alias *fa)
 337{
 338        call_rcu(&fa->rcu, __alias_free_mem);
 339}
 340
 341static void __leaf_free_rcu(struct rcu_head *head)
 342{
 343        struct leaf *l = container_of(head, struct leaf, rcu);
 344        kmem_cache_free(trie_leaf_kmem, l);
 345}
 346
 347static inline void free_leaf(struct leaf *l)
 348{
 349        call_rcu_bh(&l->rcu, __leaf_free_rcu);
 350}
 351
 352static void __leaf_info_free_rcu(struct rcu_head *head)
 353{
 354        kfree(container_of(head, struct leaf_info, rcu));
 355}
 356
 357static inline void free_leaf_info(struct leaf_info *leaf)
 358{
 359        call_rcu(&leaf->rcu, __leaf_info_free_rcu);
 360}
 361
 362static struct tnode *tnode_alloc(size_t size)
 363{
 364        if (size <= PAGE_SIZE)
 365                return kzalloc(size, GFP_KERNEL);
 366        else
 367                return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
 368}
 369
 370static void __tnode_vfree(struct work_struct *arg)
 371{
 372        struct tnode *tn = container_of(arg, struct tnode, work);
 373        vfree(tn);
 374}
 375
 376static void __tnode_free_rcu(struct rcu_head *head)
 377{
 378        struct tnode *tn = container_of(head, struct tnode, rcu);
 379        size_t size = sizeof(struct tnode) +
 380                      (sizeof(struct node *) << tn->bits);
 381
 382        if (size <= PAGE_SIZE)
 383                kfree(tn);
 384        else {
 385                INIT_WORK(&tn->work, __tnode_vfree);
 386                schedule_work(&tn->work);
 387        }
 388}
 389
 390static inline void tnode_free(struct tnode *tn)
 391{
 392        if (IS_LEAF(tn))
 393                free_leaf((struct leaf *) tn);
 394        else
 395                call_rcu(&tn->rcu, __tnode_free_rcu);
 396}
 397
 398static void tnode_free_safe(struct tnode *tn)
 399{
 400        BUG_ON(IS_LEAF(tn));
 401        tn->tnode_free = tnode_free_head;
 402        tnode_free_head = tn;
 403        tnode_free_size += sizeof(struct tnode) +
 404                           (sizeof(struct node *) << tn->bits);
 405}
 406
 407static void tnode_free_flush(void)
 408{
 409        struct tnode *tn;
 410
 411        while ((tn = tnode_free_head)) {
 412                tnode_free_head = tn->tnode_free;
 413                tn->tnode_free = NULL;
 414                tnode_free(tn);
 415        }
 416
 417        if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 418                tnode_free_size = 0;
 419                synchronize_rcu();
 420        }
 421}
 422
 423static struct leaf *leaf_new(void)
 424{
 425        struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 426        if (l) {
 427                l->parent = T_LEAF;
 428                INIT_HLIST_HEAD(&l->list);
 429        }
 430        return l;
 431}
 432
 433static struct leaf_info *leaf_info_new(int plen)
 434{
 435        struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 436        if (li) {
 437                li->plen = plen;
 438                INIT_LIST_HEAD(&li->falh);
 439        }
 440        return li;
 441}
 442
 443static struct tnode *tnode_new(t_key key, int pos, int bits)
 444{
 445        size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
 446        struct tnode *tn = tnode_alloc(sz);
 447
 448        if (tn) {
 449                tn->parent = T_TNODE;
 450                tn->pos = pos;
 451                tn->bits = bits;
 452                tn->key = key;
 453                tn->full_children = 0;
 454                tn->empty_children = 1<<bits;
 455        }
 456
 457        pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
 458                 (unsigned long) (sizeof(struct node) << bits));
 459        return tn;
 460}
 461
 462/*
 463 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 464 * and no bits are skipped. See discussion in dyntree paper p. 6
 465 */
 466
 467static inline int tnode_full(const struct tnode *tn, const struct node *n)
 468{
 469        if (n == NULL || IS_LEAF(n))
 470                return 0;
 471
 472        return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 473}
 474
 475static inline void put_child(struct trie *t, struct tnode *tn, int i,
 476                             struct node *n)
 477{
 478        tnode_put_child_reorg(tn, i, n, -1);
 479}
 480
 481 /*
 482  * Add a child at position i overwriting the old value.
 483  * Update the value of full_children and empty_children.
 484  */
 485
 486static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 487                                  int wasfull)
 488{
 489        struct node *chi = tn->child[i];
 490        int isfull;
 491
 492        BUG_ON(i >= 1<<tn->bits);
 493
 494        /* update emptyChildren */
 495        if (n == NULL && chi != NULL)
 496                tn->empty_children++;
 497        else if (n != NULL && chi == NULL)
 498                tn->empty_children--;
 499
 500        /* update fullChildren */
 501        if (wasfull == -1)
 502                wasfull = tnode_full(tn, chi);
 503
 504        isfull = tnode_full(tn, n);
 505        if (wasfull && !isfull)
 506                tn->full_children--;
 507        else if (!wasfull && isfull)
 508                tn->full_children++;
 509
 510        if (n)
 511                node_set_parent(n, tn);
 512
 513        rcu_assign_pointer(tn->child[i], n);
 514}
 515
 516#define MAX_WORK 10
 517static struct node *resize(struct trie *t, struct tnode *tn)
 518{
 519        int i;
 520        struct tnode *old_tn;
 521        int inflate_threshold_use;
 522        int halve_threshold_use;
 523        int max_work;
 524
 525        if (!tn)
 526                return NULL;
 527
 528        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 529                 tn, inflate_threshold, halve_threshold);
 530
 531        /* No children */
 532        if (tn->empty_children == tnode_child_length(tn)) {
 533                tnode_free_safe(tn);
 534                return NULL;
 535        }
 536        /* One child */
 537        if (tn->empty_children == tnode_child_length(tn) - 1)
 538                goto one_child;
 539        /*
 540         * Double as long as the resulting node has a number of
 541         * nonempty nodes that are above the threshold.
 542         */
 543
 544        /*
 545         * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 546         * the Helsinki University of Technology and Matti Tikkanen of Nokia
 547         * Telecommunications, page 6:
 548         * "A node is doubled if the ratio of non-empty children to all
 549         * children in the *doubled* node is at least 'high'."
 550         *
 551         * 'high' in this instance is the variable 'inflate_threshold'. It
 552         * is expressed as a percentage, so we multiply it with
 553         * tnode_child_length() and instead of multiplying by 2 (since the
 554         * child array will be doubled by inflate()) and multiplying
 555         * the left-hand side by 100 (to handle the percentage thing) we
 556         * multiply the left-hand side by 50.
 557         *
 558         * The left-hand side may look a bit weird: tnode_child_length(tn)
 559         * - tn->empty_children is of course the number of non-null children
 560         * in the current node. tn->full_children is the number of "full"
 561         * children, that is non-null tnodes with a skip value of 0.
 562         * All of those will be doubled in the resulting inflated tnode, so
 563         * we just count them one extra time here.
 564         *
 565         * A clearer way to write this would be:
 566         *
 567         * to_be_doubled = tn->full_children;
 568         * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 569         *     tn->full_children;
 570         *
 571         * new_child_length = tnode_child_length(tn) * 2;
 572         *
 573         * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 574         *      new_child_length;
 575         * if (new_fill_factor >= inflate_threshold)
 576         *
 577         * ...and so on, tho it would mess up the while () loop.
 578         *
 579         * anyway,
 580         * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 581         *      inflate_threshold
 582         *
 583         * avoid a division:
 584         * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 585         *      inflate_threshold * new_child_length
 586         *
 587         * expand not_to_be_doubled and to_be_doubled, and shorten:
 588         * 100 * (tnode_child_length(tn) - tn->empty_children +
 589         *    tn->full_children) >= inflate_threshold * new_child_length
 590         *
 591         * expand new_child_length:
 592         * 100 * (tnode_child_length(tn) - tn->empty_children +
 593         *    tn->full_children) >=
 594         *      inflate_threshold * tnode_child_length(tn) * 2
 595         *
 596         * shorten again:
 597         * 50 * (tn->full_children + tnode_child_length(tn) -
 598         *    tn->empty_children) >= inflate_threshold *
 599         *    tnode_child_length(tn)
 600         *
 601         */
 602
 603        check_tnode(tn);
 604
 605        /* Keep root node larger  */
 606
 607        if (!node_parent((struct node*) tn)) {
 608                inflate_threshold_use = inflate_threshold_root;
 609                halve_threshold_use = halve_threshold_root;
 610        }
 611        else {
 612                inflate_threshold_use = inflate_threshold;
 613                halve_threshold_use = halve_threshold;
 614        }
 615
 616        max_work = MAX_WORK;
 617        while ((tn->full_children > 0 &&  max_work-- &&
 618                50 * (tn->full_children + tnode_child_length(tn)
 619                      - tn->empty_children)
 620                >= inflate_threshold_use * tnode_child_length(tn))) {
 621
 622                old_tn = tn;
 623                tn = inflate(t, tn);
 624
 625                if (IS_ERR(tn)) {
 626                        tn = old_tn;
 627#ifdef CONFIG_IP_FIB_TRIE_STATS
 628                        t->stats.resize_node_skipped++;
 629#endif
 630                        break;
 631                }
 632        }
 633
 634        check_tnode(tn);
 635
 636        /* Return if at least one inflate is run */
 637        if( max_work != MAX_WORK)
 638                return (struct node *) tn;
 639
 640        /*
 641         * Halve as long as the number of empty children in this
 642         * node is above threshold.
 643         */
 644
 645        max_work = MAX_WORK;
 646        while (tn->bits > 1 &&  max_work-- &&
 647               100 * (tnode_child_length(tn) - tn->empty_children) <
 648               halve_threshold_use * tnode_child_length(tn)) {
 649
 650                old_tn = tn;
 651                tn = halve(t, tn);
 652                if (IS_ERR(tn)) {
 653                        tn = old_tn;
 654#ifdef CONFIG_IP_FIB_TRIE_STATS
 655                        t->stats.resize_node_skipped++;
 656#endif
 657                        break;
 658                }
 659        }
 660
 661
 662        /* Only one child remains */
 663        if (tn->empty_children == tnode_child_length(tn) - 1) {
 664one_child:
 665                for (i = 0; i < tnode_child_length(tn); i++) {
 666                        struct node *n;
 667
 668                        n = tn->child[i];
 669                        if (!n)
 670                                continue;
 671
 672                        /* compress one level */
 673
 674                        node_set_parent(n, NULL);
 675                        tnode_free_safe(tn);
 676                        return n;
 677                }
 678        }
 679        return (struct node *) tn;
 680}
 681
 682static struct tnode *inflate(struct trie *t, struct tnode *tn)
 683{
 684        struct tnode *oldtnode = tn;
 685        int olen = tnode_child_length(tn);
 686        int i;
 687
 688        pr_debug("In inflate\n");
 689
 690        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 691
 692        if (!tn)
 693                return ERR_PTR(-ENOMEM);
 694
 695        /*
 696         * Preallocate and store tnodes before the actual work so we
 697         * don't get into an inconsistent state if memory allocation
 698         * fails. In case of failure we return the oldnode and  inflate
 699         * of tnode is ignored.
 700         */
 701
 702        for (i = 0; i < olen; i++) {
 703                struct tnode *inode;
 704
 705                inode = (struct tnode *) tnode_get_child(oldtnode, i);
 706                if (inode &&
 707                    IS_TNODE(inode) &&
 708                    inode->pos == oldtnode->pos + oldtnode->bits &&
 709                    inode->bits > 1) {
 710                        struct tnode *left, *right;
 711                        t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 712
 713                        left = tnode_new(inode->key&(~m), inode->pos + 1,
 714                                         inode->bits - 1);
 715                        if (!left)
 716                                goto nomem;
 717
 718                        right = tnode_new(inode->key|m, inode->pos + 1,
 719                                          inode->bits - 1);
 720
 721                        if (!right) {
 722                                tnode_free(left);
 723                                goto nomem;
 724                        }
 725
 726                        put_child(t, tn, 2*i, (struct node *) left);
 727                        put_child(t, tn, 2*i+1, (struct node *) right);
 728                }
 729        }
 730
 731        for (i = 0; i < olen; i++) {
 732                struct tnode *inode;
 733                struct node *node = tnode_get_child(oldtnode, i);
 734                struct tnode *left, *right;
 735                int size, j;
 736
 737                /* An empty child */
 738                if (node == NULL)
 739                        continue;
 740
 741                /* A leaf or an internal node with skipped bits */
 742
 743                if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 744                   tn->pos + tn->bits - 1) {
 745                        if (tkey_extract_bits(node->key,
 746                                              oldtnode->pos + oldtnode->bits,
 747                                              1) == 0)
 748                                put_child(t, tn, 2*i, node);
 749                        else
 750                                put_child(t, tn, 2*i+1, node);
 751                        continue;
 752                }
 753
 754                /* An internal node with two children */
 755                inode = (struct tnode *) node;
 756
 757                if (inode->bits == 1) {
 758                        put_child(t, tn, 2*i, inode->child[0]);
 759                        put_child(t, tn, 2*i+1, inode->child[1]);
 760
 761                        tnode_free_safe(inode);
 762                        continue;
 763                }
 764
 765                /* An internal node with more than two children */
 766
 767                /* We will replace this node 'inode' with two new
 768                 * ones, 'left' and 'right', each with half of the
 769                 * original children. The two new nodes will have
 770                 * a position one bit further down the key and this
 771                 * means that the "significant" part of their keys
 772                 * (see the discussion near the top of this file)
 773                 * will differ by one bit, which will be "0" in
 774                 * left's key and "1" in right's key. Since we are
 775                 * moving the key position by one step, the bit that
 776                 * we are moving away from - the bit at position
 777                 * (inode->pos) - is the one that will differ between
 778                 * left and right. So... we synthesize that bit in the
 779                 * two  new keys.
 780                 * The mask 'm' below will be a single "one" bit at
 781                 * the position (inode->pos)
 782                 */
 783
 784                /* Use the old key, but set the new significant
 785                 *   bit to zero.
 786                 */
 787
 788                left = (struct tnode *) tnode_get_child(tn, 2*i);
 789                put_child(t, tn, 2*i, NULL);
 790
 791                BUG_ON(!left);
 792
 793                right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 794                put_child(t, tn, 2*i+1, NULL);
 795
 796                BUG_ON(!right);
 797
 798                size = tnode_child_length(left);
 799                for (j = 0; j < size; j++) {
 800                        put_child(t, left, j, inode->child[j]);
 801                        put_child(t, right, j, inode->child[j + size]);
 802                }
 803                put_child(t, tn, 2*i, resize(t, left));
 804                put_child(t, tn, 2*i+1, resize(t, right));
 805
 806                tnode_free_safe(inode);
 807        }
 808        tnode_free_safe(oldtnode);
 809        return tn;
 810nomem:
 811        {
 812                int size = tnode_child_length(tn);
 813                int j;
 814
 815                for (j = 0; j < size; j++)
 816                        if (tn->child[j])
 817                                tnode_free((struct tnode *)tn->child[j]);
 818
 819                tnode_free(tn);
 820
 821                return ERR_PTR(-ENOMEM);
 822        }
 823}
 824
 825static struct tnode *halve(struct trie *t, struct tnode *tn)
 826{
 827        struct tnode *oldtnode = tn;
 828        struct node *left, *right;
 829        int i;
 830        int olen = tnode_child_length(tn);
 831
 832        pr_debug("In halve\n");
 833
 834        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 835
 836        if (!tn)
 837                return ERR_PTR(-ENOMEM);
 838
 839        /*
 840         * Preallocate and store tnodes before the actual work so we
 841         * don't get into an inconsistent state if memory allocation
 842         * fails. In case of failure we return the oldnode and halve
 843         * of tnode is ignored.
 844         */
 845
 846        for (i = 0; i < olen; i += 2) {
 847                left = tnode_get_child(oldtnode, i);
 848                right = tnode_get_child(oldtnode, i+1);
 849
 850                /* Two nonempty children */
 851                if (left && right) {
 852                        struct tnode *newn;
 853
 854                        newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 855
 856                        if (!newn)
 857                                goto nomem;
 858
 859                        put_child(t, tn, i/2, (struct node *)newn);
 860                }
 861
 862        }
 863
 864        for (i = 0; i < olen; i += 2) {
 865                struct tnode *newBinNode;
 866
 867                left = tnode_get_child(oldtnode, i);
 868                right = tnode_get_child(oldtnode, i+1);
 869
 870                /* At least one of the children is empty */
 871                if (left == NULL) {
 872                        if (right == NULL)    /* Both are empty */
 873                                continue;
 874                        put_child(t, tn, i/2, right);
 875                        continue;
 876                }
 877
 878                if (right == NULL) {
 879                        put_child(t, tn, i/2, left);
 880                        continue;
 881                }
 882
 883                /* Two nonempty children */
 884                newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 885                put_child(t, tn, i/2, NULL);
 886                put_child(t, newBinNode, 0, left);
 887                put_child(t, newBinNode, 1, right);
 888                put_child(t, tn, i/2, resize(t, newBinNode));
 889        }
 890        tnode_free_safe(oldtnode);
 891        return tn;
 892nomem:
 893        {
 894                int size = tnode_child_length(tn);
 895                int j;
 896
 897                for (j = 0; j < size; j++)
 898                        if (tn->child[j])
 899                                tnode_free((struct tnode *)tn->child[j]);
 900
 901                tnode_free(tn);
 902
 903                return ERR_PTR(-ENOMEM);
 904        }
 905}
 906
 907/* readside must use rcu_read_lock currently dump routines
 908 via get_fa_head and dump */
 909
 910static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 911{
 912        struct hlist_head *head = &l->list;
 913        struct hlist_node *node;
 914        struct leaf_info *li;
 915
 916        hlist_for_each_entry_rcu(li, node, head, hlist)
 917                if (li->plen == plen)
 918                        return li;
 919
 920        return NULL;
 921}
 922
 923static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 924{
 925        struct leaf_info *li = find_leaf_info(l, plen);
 926
 927        if (!li)
 928                return NULL;
 929
 930        return &li->falh;
 931}
 932
 933static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 934{
 935        struct leaf_info *li = NULL, *last = NULL;
 936        struct hlist_node *node;
 937
 938        if (hlist_empty(head)) {
 939                hlist_add_head_rcu(&new->hlist, head);
 940        } else {
 941                hlist_for_each_entry(li, node, head, hlist) {
 942                        if (new->plen > li->plen)
 943                                break;
 944
 945                        last = li;
 946                }
 947                if (last)
 948                        hlist_add_after_rcu(&last->hlist, &new->hlist);
 949                else
 950                        hlist_add_before_rcu(&new->hlist, &li->hlist);
 951        }
 952}
 953
 954/* rcu_read_lock needs to be hold by caller from readside */
 955
 956static struct leaf *
 957fib_find_node(struct trie *t, u32 key)
 958{
 959        int pos;
 960        struct tnode *tn;
 961        struct node *n;
 962
 963        pos = 0;
 964        n = rcu_dereference(t->trie);
 965
 966        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 967                tn = (struct tnode *) n;
 968
 969                check_tnode(tn);
 970
 971                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 972                        pos = tn->pos + tn->bits;
 973                        n = tnode_get_child_rcu(tn,
 974                                                tkey_extract_bits(key,
 975                                                                  tn->pos,
 976                                                                  tn->bits));
 977                } else
 978                        break;
 979        }
 980        /* Case we have found a leaf. Compare prefixes */
 981
 982        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 983                return (struct leaf *)n;
 984
 985        return NULL;
 986}
 987
 988static void trie_rebalance(struct trie *t, struct tnode *tn)
 989{
 990        int wasfull;
 991        t_key cindex, key;
 992        struct tnode *tp;
 993
 994        key = tn->key;
 995
 996        while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
 997                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 998                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
 999                tn = (struct tnode *) resize(t, (struct tnode *)tn);
1000
1001                tnode_put_child_reorg((struct tnode *)tp, cindex,
1002                                      (struct node *)tn, wasfull);
1003
1004                tp = node_parent((struct node *) tn);
1005                if (!tp)
1006                        rcu_assign_pointer(t->trie, (struct node *)tn);
1007
1008                tnode_free_flush();
1009                if (!tp)
1010                        break;
1011                tn = tp;
1012        }
1013
1014        /* Handle last (top) tnode */
1015        if (IS_TNODE(tn))
1016                tn = (struct tnode *)resize(t, (struct tnode *)tn);
1017
1018        rcu_assign_pointer(t->trie, (struct node *)tn);
1019        tnode_free_flush();
1020
1021        return;
1022}
1023
1024/* only used from updater-side */
1025
1026static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1027{
1028        int pos, newpos;
1029        struct tnode *tp = NULL, *tn = NULL;
1030        struct node *n;
1031        struct leaf *l;
1032        int missbit;
1033        struct list_head *fa_head = NULL;
1034        struct leaf_info *li;
1035        t_key cindex;
1036
1037        pos = 0;
1038        n = t->trie;
1039
1040        /* If we point to NULL, stop. Either the tree is empty and we should
1041         * just put a new leaf in if, or we have reached an empty child slot,
1042         * and we should just put our new leaf in that.
1043         * If we point to a T_TNODE, check if it matches our key. Note that
1044         * a T_TNODE might be skipping any number of bits - its 'pos' need
1045         * not be the parent's 'pos'+'bits'!
1046         *
1047         * If it does match the current key, get pos/bits from it, extract
1048         * the index from our key, push the T_TNODE and walk the tree.
1049         *
1050         * If it doesn't, we have to replace it with a new T_TNODE.
1051         *
1052         * If we point to a T_LEAF, it might or might not have the same key
1053         * as we do. If it does, just change the value, update the T_LEAF's
1054         * value, and return it.
1055         * If it doesn't, we need to replace it with a T_TNODE.
1056         */
1057
1058        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1059                tn = (struct tnode *) n;
1060
1061                check_tnode(tn);
1062
1063                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1064                        tp = tn;
1065                        pos = tn->pos + tn->bits;
1066                        n = tnode_get_child(tn,
1067                                            tkey_extract_bits(key,
1068                                                              tn->pos,
1069                                                              tn->bits));
1070
1071                        BUG_ON(n && node_parent(n) != tn);
1072                } else
1073                        break;
1074        }
1075
1076        /*
1077         * n  ----> NULL, LEAF or TNODE
1078         *
1079         * tp is n's (parent) ----> NULL or TNODE
1080         */
1081
1082        BUG_ON(tp && IS_LEAF(tp));
1083
1084        /* Case 1: n is a leaf. Compare prefixes */
1085
1086        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1087                l = (struct leaf *) n;
1088                li = leaf_info_new(plen);
1089
1090                if (!li)
1091                        return NULL;
1092
1093                fa_head = &li->falh;
1094                insert_leaf_info(&l->list, li);
1095                goto done;
1096        }
1097        l = leaf_new();
1098
1099        if (!l)
1100                return NULL;
1101
1102        l->key = key;
1103        li = leaf_info_new(plen);
1104
1105        if (!li) {
1106                free_leaf(l);
1107                return NULL;
1108        }
1109
1110        fa_head = &li->falh;
1111        insert_leaf_info(&l->list, li);
1112
1113        if (t->trie && n == NULL) {
1114                /* Case 2: n is NULL, and will just insert a new leaf */
1115
1116                node_set_parent((struct node *)l, tp);
1117
1118                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1119                put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1120        } else {
1121                /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1122                /*
1123                 *  Add a new tnode here
1124                 *  first tnode need some special handling
1125                 */
1126
1127                if (tp)
1128                        pos = tp->pos+tp->bits;
1129                else
1130                        pos = 0;
1131
1132                if (n) {
1133                        newpos = tkey_mismatch(key, pos, n->key);
1134                        tn = tnode_new(n->key, newpos, 1);
1135                } else {
1136                        newpos = 0;
1137                        tn = tnode_new(key, newpos, 1); /* First tnode */
1138                }
1139
1140                if (!tn) {
1141                        free_leaf_info(li);
1142                        free_leaf(l);
1143                        return NULL;
1144                }
1145
1146                node_set_parent((struct node *)tn, tp);
1147
1148                missbit = tkey_extract_bits(key, newpos, 1);
1149                put_child(t, tn, missbit, (struct node *)l);
1150                put_child(t, tn, 1-missbit, n);
1151
1152                if (tp) {
1153                        cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1154                        put_child(t, (struct tnode *)tp, cindex,
1155                                  (struct node *)tn);
1156                } else {
1157                        rcu_assign_pointer(t->trie, (struct node *)tn);
1158                        tp = tn;
1159                }
1160        }
1161
1162        if (tp && tp->pos + tp->bits > 32)
1163                pr_warning("fib_trie"
1164                           " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1165                           tp, tp->pos, tp->bits, key, plen);
1166
1167        /* Rebalance the trie */
1168
1169        trie_rebalance(t, tp);
1170done:
1171        return fa_head;
1172}
1173
1174/*
1175 * Caller must hold RTNL.
1176 */
1177int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1178{
1179        struct trie *t = (struct trie *) tb->tb_data;
1180        struct fib_alias *fa, *new_fa;
1181        struct list_head *fa_head = NULL;
1182        struct fib_info *fi;
1183        int plen = cfg->fc_dst_len;
1184        u8 tos = cfg->fc_tos;
1185        u32 key, mask;
1186        int err;
1187        struct leaf *l;
1188
1189        if (plen > 32)
1190                return -EINVAL;
1191
1192        key = ntohl(cfg->fc_dst);
1193
1194        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1195
1196        mask = ntohl(inet_make_mask(plen));
1197
1198        if (key & ~mask)
1199                return -EINVAL;
1200
1201        key = key & mask;
1202
1203        fi = fib_create_info(cfg);
1204        if (IS_ERR(fi)) {
1205                err = PTR_ERR(fi);
1206                goto err;
1207        }
1208
1209        l = fib_find_node(t, key);
1210        fa = NULL;
1211
1212        if (l) {
1213                fa_head = get_fa_head(l, plen);
1214                fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1215        }
1216
1217        /* Now fa, if non-NULL, points to the first fib alias
1218         * with the same keys [prefix,tos,priority], if such key already
1219         * exists or to the node before which we will insert new one.
1220         *
1221         * If fa is NULL, we will need to allocate a new one and
1222         * insert to the head of f.
1223         *
1224         * If f is NULL, no fib node matched the destination key
1225         * and we need to allocate a new one of those as well.
1226         */
1227
1228        if (fa && fa->fa_tos == tos &&
1229            fa->fa_info->fib_priority == fi->fib_priority) {
1230                struct fib_alias *fa_first, *fa_match;
1231
1232                err = -EEXIST;
1233                if (cfg->fc_nlflags & NLM_F_EXCL)
1234                        goto out;
1235
1236                /* We have 2 goals:
1237                 * 1. Find exact match for type, scope, fib_info to avoid
1238                 * duplicate routes
1239                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1240                 */
1241                fa_match = NULL;
1242                fa_first = fa;
1243                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1244                list_for_each_entry_continue(fa, fa_head, fa_list) {
1245                        if (fa->fa_tos != tos)
1246                                break;
1247                        if (fa->fa_info->fib_priority != fi->fib_priority)
1248                                break;
1249                        if (fa->fa_type == cfg->fc_type &&
1250                            fa->fa_scope == cfg->fc_scope &&
1251                            fa->fa_info == fi) {
1252                                fa_match = fa;
1253                                break;
1254                        }
1255                }
1256
1257                if (cfg->fc_nlflags & NLM_F_REPLACE) {
1258                        struct fib_info *fi_drop;
1259                        u8 state;
1260
1261                        fa = fa_first;
1262                        if (fa_match) {
1263                                if (fa == fa_match)
1264                                        err = 0;
1265                                goto out;
1266                        }
1267                        err = -ENOBUFS;
1268                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1269                        if (new_fa == NULL)
1270                                goto out;
1271
1272                        fi_drop = fa->fa_info;
1273                        new_fa->fa_tos = fa->fa_tos;
1274                        new_fa->fa_info = fi;
1275                        new_fa->fa_type = cfg->fc_type;
1276                        new_fa->fa_scope = cfg->fc_scope;
1277                        state = fa->fa_state;
1278                        new_fa->fa_state = state & ~FA_S_ACCESSED;
1279
1280                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1281                        alias_free_mem_rcu(fa);
1282
1283                        fib_release_info(fi_drop);
1284                        if (state & FA_S_ACCESSED)
1285                                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1286                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1287                                tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1288
1289                        goto succeeded;
1290                }
1291                /* Error if we find a perfect match which
1292                 * uses the same scope, type, and nexthop
1293                 * information.
1294                 */
1295                if (fa_match)
1296                        goto out;
1297
1298                if (!(cfg->fc_nlflags & NLM_F_APPEND))
1299                        fa = fa_first;
1300        }
1301        err = -ENOENT;
1302        if (!(cfg->fc_nlflags & NLM_F_CREATE))
1303                goto out;
1304
1305        err = -ENOBUFS;
1306        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1307        if (new_fa == NULL)
1308                goto out;
1309
1310        new_fa->fa_info = fi;
1311        new_fa->fa_tos = tos;
1312        new_fa->fa_type = cfg->fc_type;
1313        new_fa->fa_scope = cfg->fc_scope;
1314        new_fa->fa_state = 0;
1315        /*
1316         * Insert new entry to the list.
1317         */
1318
1319        if (!fa_head) {
1320                fa_head = fib_insert_node(t, key, plen);
1321                if (unlikely(!fa_head)) {
1322                        err = -ENOMEM;
1323                        goto out_free_new_fa;
1324                }
1325        }
1326
1327        list_add_tail_rcu(&new_fa->fa_list,
1328                          (fa ? &fa->fa_list : fa_head));
1329
1330        rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1331        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1332                  &cfg->fc_nlinfo, 0);
1333succeeded:
1334        return 0;
1335
1336out_free_new_fa:
1337        kmem_cache_free(fn_alias_kmem, new_fa);
1338out:
1339        fib_release_info(fi);
1340err:
1341        return err;
1342}
1343
1344/* should be called with rcu_read_lock */
1345static int check_leaf(struct trie *t, struct leaf *l,
1346                      t_key key,  const struct flowi *flp,
1347                      struct fib_result *res)
1348{
1349        struct leaf_info *li;
1350        struct hlist_head *hhead = &l->list;
1351        struct hlist_node *node;
1352
1353        hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1354                int err;
1355                int plen = li->plen;
1356                __be32 mask = inet_make_mask(plen);
1357
1358                if (l->key != (key & ntohl(mask)))
1359                        continue;
1360
1361                err = fib_semantic_match(&li->falh, flp, res, plen);
1362
1363#ifdef CONFIG_IP_FIB_TRIE_STATS
1364                if (err <= 0)
1365                        t->stats.semantic_match_passed++;
1366                else
1367                        t->stats.semantic_match_miss++;
1368#endif
1369                if (err <= 0)
1370                        return err;
1371        }
1372
1373        return 1;
1374}
1375
1376int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1377                     struct fib_result *res)
1378{
1379        struct trie *t = (struct trie *) tb->tb_data;
1380        int ret;
1381        struct node *n;
1382        struct tnode *pn;
1383        int pos, bits;
1384        t_key key = ntohl(flp->fl4_dst);
1385        int chopped_off;
1386        t_key cindex = 0;
1387        int current_prefix_length = KEYLENGTH;
1388        struct tnode *cn;
1389        t_key node_prefix, key_prefix, pref_mismatch;
1390        int mp;
1391
1392        rcu_read_lock();
1393
1394        n = rcu_dereference(t->trie);
1395        if (!n)
1396                goto failed;
1397
1398#ifdef CONFIG_IP_FIB_TRIE_STATS
1399        t->stats.gets++;
1400#endif
1401
1402        /* Just a leaf? */
1403        if (IS_LEAF(n)) {
1404                ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1405                goto found;
1406        }
1407
1408        pn = (struct tnode *) n;
1409        chopped_off = 0;
1410
1411        while (pn) {
1412                pos = pn->pos;
1413                bits = pn->bits;
1414
1415                if (!chopped_off)
1416                        cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1417                                                   pos, bits);
1418
1419                n = tnode_get_child_rcu(pn, cindex);
1420
1421                if (n == NULL) {
1422#ifdef CONFIG_IP_FIB_TRIE_STATS
1423                        t->stats.null_node_hit++;
1424#endif
1425                        goto backtrace;
1426                }
1427
1428                if (IS_LEAF(n)) {
1429                        ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1430                        if (ret > 0)
1431                                goto backtrace;
1432                        goto found;
1433                }
1434
1435                cn = (struct tnode *)n;
1436
1437                /*
1438                 * It's a tnode, and we can do some extra checks here if we
1439                 * like, to avoid descending into a dead-end branch.
1440                 * This tnode is in the parent's child array at index
1441                 * key[p_pos..p_pos+p_bits] but potentially with some bits
1442                 * chopped off, so in reality the index may be just a
1443                 * subprefix, padded with zero at the end.
1444                 * We can also take a look at any skipped bits in this
1445                 * tnode - everything up to p_pos is supposed to be ok,
1446                 * and the non-chopped bits of the index (se previous
1447                 * paragraph) are also guaranteed ok, but the rest is
1448                 * considered unknown.
1449                 *
1450                 * The skipped bits are key[pos+bits..cn->pos].
1451                 */
1452
1453                /* If current_prefix_length < pos+bits, we are already doing
1454                 * actual prefix  matching, which means everything from
1455                 * pos+(bits-chopped_off) onward must be zero along some
1456                 * branch of this subtree - otherwise there is *no* valid
1457                 * prefix present. Here we can only check the skipped
1458                 * bits. Remember, since we have already indexed into the
1459                 * parent's child array, we know that the bits we chopped of
1460                 * *are* zero.
1461                 */
1462
1463                /* NOTA BENE: Checking only skipped bits
1464                   for the new node here */
1465
1466                if (current_prefix_length < pos+bits) {
1467                        if (tkey_extract_bits(cn->key, current_prefix_length,
1468                                                cn->pos - current_prefix_length)
1469                            || !(cn->child[0]))
1470                                goto backtrace;
1471                }
1472
1473                /*
1474                 * If chopped_off=0, the index is fully validated and we
1475                 * only need to look at the skipped bits for this, the new,
1476                 * tnode. What we actually want to do is to find out if
1477                 * these skipped bits match our key perfectly, or if we will
1478                 * have to count on finding a matching prefix further down,
1479                 * because if we do, we would like to have some way of
1480                 * verifying the existence of such a prefix at this point.
1481                 */
1482
1483                /* The only thing we can do at this point is to verify that
1484                 * any such matching prefix can indeed be a prefix to our
1485                 * key, and if the bits in the node we are inspecting that
1486                 * do not match our key are not ZERO, this cannot be true.
1487                 * Thus, find out where there is a mismatch (before cn->pos)
1488                 * and verify that all the mismatching bits are zero in the
1489                 * new tnode's key.
1490                 */
1491
1492                /*
1493                 * Note: We aren't very concerned about the piece of
1494                 * the key that precede pn->pos+pn->bits, since these
1495                 * have already been checked. The bits after cn->pos
1496                 * aren't checked since these are by definition
1497                 * "unknown" at this point. Thus, what we want to see
1498                 * is if we are about to enter the "prefix matching"
1499                 * state, and in that case verify that the skipped
1500                 * bits that will prevail throughout this subtree are
1501                 * zero, as they have to be if we are to find a
1502                 * matching prefix.
1503                 */
1504
1505                node_prefix = mask_pfx(cn->key, cn->pos);
1506                key_prefix = mask_pfx(key, cn->pos);
1507                pref_mismatch = key_prefix^node_prefix;
1508                mp = 0;
1509
1510                /*
1511                 * In short: If skipped bits in this node do not match
1512                 * the search key, enter the "prefix matching"
1513                 * state.directly.
1514                 */
1515                if (pref_mismatch) {
1516                        while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1517                                mp++;
1518                                pref_mismatch = pref_mismatch << 1;
1519                        }
1520                        key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1521
1522                        if (key_prefix != 0)
1523                                goto backtrace;
1524
1525                        if (current_prefix_length >= cn->pos)
1526                                current_prefix_length = mp;
1527                }
1528
1529                pn = (struct tnode *)n; /* Descend */
1530                chopped_off = 0;
1531                continue;
1532
1533backtrace:
1534                chopped_off++;
1535
1536                /* As zero don't change the child key (cindex) */
1537                while ((chopped_off <= pn->bits)
1538                       && !(cindex & (1<<(chopped_off-1))))
1539                        chopped_off++;
1540
1541                /* Decrease current_... with bits chopped off */
1542                if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1543                        current_prefix_length = pn->pos + pn->bits
1544                                - chopped_off;
1545
1546                /*
1547                 * Either we do the actual chop off according or if we have
1548                 * chopped off all bits in this tnode walk up to our parent.
1549                 */
1550
1551                if (chopped_off <= pn->bits) {
1552                        cindex &= ~(1 << (chopped_off-1));
1553                } else {
1554                        struct tnode *parent = node_parent_rcu((struct node *) pn);
1555                        if (!parent)
1556                                goto failed;
1557
1558                        /* Get Child's index */
1559                        cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1560                        pn = parent;
1561                        chopped_off = 0;
1562
1563#ifdef CONFIG_IP_FIB_TRIE_STATS
1564                        t->stats.backtrack++;
1565#endif
1566                        goto backtrace;
1567                }
1568        }
1569failed:
1570        ret = 1;
1571found:
1572        rcu_read_unlock();
1573        return ret;
1574}
1575
1576/*
1577 * Remove the leaf and return parent.
1578 */
1579static void trie_leaf_remove(struct trie *t, struct leaf *l)
1580{
1581        struct tnode *tp = node_parent((struct node *) l);
1582
1583        pr_debug("entering trie_leaf_remove(%p)\n", l);
1584
1585        if (tp) {
1586                t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1587                put_child(t, (struct tnode *)tp, cindex, NULL);
1588                trie_rebalance(t, tp);
1589        } else
1590                rcu_assign_pointer(t->trie, NULL);
1591
1592        free_leaf(l);
1593}
1594
1595/*
1596 * Caller must hold RTNL.
1597 */
1598int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1599{
1600        struct trie *t = (struct trie *) tb->tb_data;
1601        u32 key, mask;
1602        int plen = cfg->fc_dst_len;
1603        u8 tos = cfg->fc_tos;
1604        struct fib_alias *fa, *fa_to_delete;
1605        struct list_head *fa_head;
1606        struct leaf *l;
1607        struct leaf_info *li;
1608
1609        if (plen > 32)
1610                return -EINVAL;
1611
1612        key = ntohl(cfg->fc_dst);
1613        mask = ntohl(inet_make_mask(plen));
1614
1615        if (key & ~mask)
1616                return -EINVAL;
1617
1618        key = key & mask;
1619        l = fib_find_node(t, key);
1620
1621        if (!l)
1622                return -ESRCH;
1623
1624        fa_head = get_fa_head(l, plen);
1625        fa = fib_find_alias(fa_head, tos, 0);
1626
1627        if (!fa)
1628                return -ESRCH;
1629
1630        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1631
1632        fa_to_delete = NULL;
1633        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1634        list_for_each_entry_continue(fa, fa_head, fa_list) {
1635                struct fib_info *fi = fa->fa_info;
1636
1637                if (fa->fa_tos != tos)
1638                        break;
1639
1640                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1641                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1642                     fa->fa_scope == cfg->fc_scope) &&
1643                    (!cfg->fc_protocol ||
1644                     fi->fib_protocol == cfg->fc_protocol) &&
1645                    fib_nh_match(cfg, fi) == 0) {
1646                        fa_to_delete = fa;
1647                        break;
1648                }
1649        }
1650
1651        if (!fa_to_delete)
1652                return -ESRCH;
1653
1654        fa = fa_to_delete;
1655        rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1656                  &cfg->fc_nlinfo, 0);
1657
1658        l = fib_find_node(t, key);
1659        li = find_leaf_info(l, plen);
1660
1661        list_del_rcu(&fa->fa_list);
1662
1663        if (list_empty(fa_head)) {
1664                hlist_del_rcu(&li->hlist);
1665                free_leaf_info(li);
1666        }
1667
1668        if (hlist_empty(&l->list))
1669                trie_leaf_remove(t, l);
1670
1671        if (fa->fa_state & FA_S_ACCESSED)
1672                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1673
1674        fib_release_info(fa->fa_info);
1675        alias_free_mem_rcu(fa);
1676        return 0;
1677}
1678
1679static int trie_flush_list(struct list_head *head)
1680{
1681        struct fib_alias *fa, *fa_node;
1682        int found = 0;
1683
1684        list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1685                struct fib_info *fi = fa->fa_info;
1686
1687                if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1688                        list_del_rcu(&fa->fa_list);
1689                        fib_release_info(fa->fa_info);
1690                        alias_free_mem_rcu(fa);
1691                        found++;
1692                }
1693        }
1694        return found;
1695}
1696
1697static int trie_flush_leaf(struct leaf *l)
1698{
1699        int found = 0;
1700        struct hlist_head *lih = &l->list;
1701        struct hlist_node *node, *tmp;
1702        struct leaf_info *li = NULL;
1703
1704        hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1705                found += trie_flush_list(&li->falh);
1706
1707                if (list_empty(&li->falh)) {
1708                        hlist_del_rcu(&li->hlist);
1709                        free_leaf_info(li);
1710                }
1711        }
1712        return found;
1713}
1714
1715/*
1716 * Scan for the next right leaf starting at node p->child[idx]
1717 * Since we have back pointer, no recursion necessary.
1718 */
1719static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1720{
1721        do {
1722                t_key idx;
1723
1724                if (c)
1725                        idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1726                else
1727                        idx = 0;
1728
1729                while (idx < 1u << p->bits) {
1730                        c = tnode_get_child_rcu(p, idx++);
1731                        if (!c)
1732                                continue;
1733
1734                        if (IS_LEAF(c)) {
1735                                prefetch(p->child[idx]);
1736                                return (struct leaf *) c;
1737                        }
1738
1739                        /* Rescan start scanning in new node */
1740                        p = (struct tnode *) c;
1741                        idx = 0;
1742                }
1743
1744                /* Node empty, walk back up to parent */
1745                c = (struct node *) p;
1746        } while ( (p = node_parent_rcu(c)) != NULL);
1747
1748        return NULL; /* Root of trie */
1749}
1750
1751static struct leaf *trie_firstleaf(struct trie *t)
1752{
1753        struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1754
1755        if (!n)
1756                return NULL;
1757
1758        if (IS_LEAF(n))          /* trie is just a leaf */
1759                return (struct leaf *) n;
1760
1761        return leaf_walk_rcu(n, NULL);
1762}
1763
1764static struct leaf *trie_nextleaf(struct leaf *l)
1765{
1766        struct node *c = (struct node *) l;
1767        struct tnode *p = node_parent_rcu(c);
1768
1769        if (!p)
1770                return NULL;    /* trie with just one leaf */
1771
1772        return leaf_walk_rcu(p, c);
1773}
1774
1775static struct leaf *trie_leafindex(struct trie *t, int index)
1776{
1777        struct leaf *l = trie_firstleaf(t);
1778
1779        while (l && index-- > 0)
1780                l = trie_nextleaf(l);
1781
1782        return l;
1783}
1784
1785
1786/*
1787 * Caller must hold RTNL.
1788 */
1789int fib_table_flush(struct fib_table *tb)
1790{
1791        struct trie *t = (struct trie *) tb->tb_data;
1792        struct leaf *l, *ll = NULL;
1793        int found = 0;
1794
1795        for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1796                found += trie_flush_leaf(l);
1797
1798                if (ll && hlist_empty(&ll->list))
1799                        trie_leaf_remove(t, ll);
1800                ll = l;
1801        }
1802
1803        if (ll && hlist_empty(&ll->list))
1804                trie_leaf_remove(t, ll);
1805
1806        pr_debug("trie_flush found=%d\n", found);
1807        return found;
1808}
1809
1810void fib_table_select_default(struct fib_table *tb,
1811                              const struct flowi *flp,
1812                              struct fib_result *res)
1813{
1814        struct trie *t = (struct trie *) tb->tb_data;
1815        int order, last_idx;
1816        struct fib_info *fi = NULL;
1817        struct fib_info *last_resort;
1818        struct fib_alias *fa = NULL;
1819        struct list_head *fa_head;
1820        struct leaf *l;
1821
1822        last_idx = -1;
1823        last_resort = NULL;
1824        order = -1;
1825
1826        rcu_read_lock();
1827
1828        l = fib_find_node(t, 0);
1829        if (!l)
1830                goto out;
1831
1832        fa_head = get_fa_head(l, 0);
1833        if (!fa_head)
1834                goto out;
1835
1836        if (list_empty(fa_head))
1837                goto out;
1838
1839        list_for_each_entry_rcu(fa, fa_head, fa_list) {
1840                struct fib_info *next_fi = fa->fa_info;
1841
1842                if (fa->fa_scope != res->scope ||
1843                    fa->fa_type != RTN_UNICAST)
1844                        continue;
1845
1846                if (next_fi->fib_priority > res->fi->fib_priority)
1847                        break;
1848                if (!next_fi->fib_nh[0].nh_gw ||
1849                    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1850                        continue;
1851                fa->fa_state |= FA_S_ACCESSED;
1852
1853                if (fi == NULL) {
1854                        if (next_fi != res->fi)
1855                                break;
1856                } else if (!fib_detect_death(fi, order, &last_resort,
1857                                             &last_idx, tb->tb_default)) {
1858                        fib_result_assign(res, fi);
1859                        tb->tb_default = order;
1860                        goto out;
1861                }
1862                fi = next_fi;
1863                order++;
1864        }
1865        if (order <= 0 || fi == NULL) {
1866                tb->tb_default = -1;
1867                goto out;
1868        }
1869
1870        if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1871                                tb->tb_default)) {
1872                fib_result_assign(res, fi);
1873                tb->tb_default = order;
1874                goto out;
1875        }
1876        if (last_idx >= 0)
1877                fib_result_assign(res, last_resort);
1878        tb->tb_default = last_idx;
1879out:
1880        rcu_read_unlock();
1881}
1882
1883static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1884                           struct fib_table *tb,
1885                           struct sk_buff *skb, struct netlink_callback *cb)
1886{
1887        int i, s_i;
1888        struct fib_alias *fa;
1889        __be32 xkey = htonl(key);
1890
1891        s_i = cb->args[5];
1892        i = 0;
1893
1894        /* rcu_read_lock is hold by caller */
1895
1896        list_for_each_entry_rcu(fa, fah, fa_list) {
1897                if (i < s_i) {
1898                        i++;
1899                        continue;
1900                }
1901
1902                if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1903                                  cb->nlh->nlmsg_seq,
1904                                  RTM_NEWROUTE,
1905                                  tb->tb_id,
1906                                  fa->fa_type,
1907                                  fa->fa_scope,
1908                                  xkey,
1909                                  plen,
1910                                  fa->fa_tos,
1911                                  fa->fa_info, NLM_F_MULTI) < 0) {
1912                        cb->args[5] = i;
1913                        return -1;
1914                }
1915                i++;
1916        }
1917        cb->args[5] = i;
1918        return skb->len;
1919}
1920
1921static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1922                        struct sk_buff *skb, struct netlink_callback *cb)
1923{
1924        struct leaf_info *li;
1925        struct hlist_node *node;
1926        int i, s_i;
1927
1928        s_i = cb->args[4];
1929        i = 0;
1930
1931        /* rcu_read_lock is hold by caller */
1932        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1933                if (i < s_i) {
1934                        i++;
1935                        continue;
1936                }
1937
1938                if (i > s_i)
1939                        cb->args[5] = 0;
1940
1941                if (list_empty(&li->falh))
1942                        continue;
1943
1944                if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1945                        cb->args[4] = i;
1946                        return -1;
1947                }
1948                i++;
1949        }
1950
1951        cb->args[4] = i;
1952        return skb->len;
1953}
1954
1955int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1956                   struct netlink_callback *cb)
1957{
1958        struct leaf *l;
1959        struct trie *t = (struct trie *) tb->tb_data;
1960        t_key key = cb->args[2];
1961        int count = cb->args[3];
1962
1963        rcu_read_lock();
1964        /* Dump starting at last key.
1965         * Note: 0.0.0.0/0 (ie default) is first key.
1966         */
1967        if (count == 0)
1968                l = trie_firstleaf(t);
1969        else {
1970                /* Normally, continue from last key, but if that is missing
1971                 * fallback to using slow rescan
1972                 */
1973                l = fib_find_node(t, key);
1974                if (!l)
1975                        l = trie_leafindex(t, count);
1976        }
1977
1978        while (l) {
1979                cb->args[2] = l->key;
1980                if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1981                        cb->args[3] = count;
1982                        rcu_read_unlock();
1983                        return -1;
1984                }
1985
1986                ++count;
1987                l = trie_nextleaf(l);
1988                memset(&cb->args[4], 0,
1989                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1990        }
1991        cb->args[3] = count;
1992        rcu_read_unlock();
1993
1994        return skb->len;
1995}
1996
1997void __init fib_hash_init(void)
1998{
1999        fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2000                                          sizeof(struct fib_alias),
2001                                          0, SLAB_PANIC, NULL);
2002
2003        trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2004                                           max(sizeof(struct leaf),
2005                                               sizeof(struct leaf_info)),
2006                                           0, SLAB_PANIC, NULL);
2007}
2008
2009
2010/* Fix more generic FIB names for init later */
2011struct fib_table *fib_hash_table(u32 id)
2012{
2013        struct fib_table *tb;
2014        struct trie *t;
2015
2016        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2017                     GFP_KERNEL);
2018        if (tb == NULL)
2019                return NULL;
2020
2021        tb->tb_id = id;
2022        tb->tb_default = -1;
2023
2024        t = (struct trie *) tb->tb_data;
2025        memset(t, 0, sizeof(*t));
2026
2027        if (id == RT_TABLE_LOCAL)
2028                pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2029
2030        return tb;
2031}
2032
2033#ifdef CONFIG_PROC_FS
2034/* Depth first Trie walk iterator */
2035struct fib_trie_iter {
2036        struct seq_net_private p;
2037        struct fib_table *tb;
2038        struct tnode *tnode;
2039        unsigned index;
2040        unsigned depth;
2041};
2042
2043static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2044{
2045        struct tnode *tn = iter->tnode;
2046        unsigned cindex = iter->index;
2047        struct tnode *p;
2048
2049        /* A single entry routing table */
2050        if (!tn)
2051                return NULL;
2052
2053        pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2054                 iter->tnode, iter->index, iter->depth);
2055rescan:
2056        while (cindex < (1<<tn->bits)) {
2057                struct node *n = tnode_get_child_rcu(tn, cindex);
2058
2059                if (n) {
2060                        if (IS_LEAF(n)) {
2061                                iter->tnode = tn;
2062                                iter->index = cindex + 1;
2063                        } else {
2064                                /* push down one level */
2065                                iter->tnode = (struct tnode *) n;
2066                                iter->index = 0;
2067                                ++iter->depth;
2068                        }
2069                        return n;
2070                }
2071
2072                ++cindex;
2073        }
2074
2075        /* Current node exhausted, pop back up */
2076        p = node_parent_rcu((struct node *)tn);
2077        if (p) {
2078                cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2079                tn = p;
2080                --iter->depth;
2081                goto rescan;
2082        }
2083
2084        /* got root? */
2085        return NULL;
2086}
2087
2088static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2089                                       struct trie *t)
2090{
2091        struct node *n;
2092
2093        if (!t)
2094                return NULL;
2095
2096        n = rcu_dereference(t->trie);
2097        if (!n)
2098                return NULL;
2099
2100        if (IS_TNODE(n)) {
2101                iter->tnode = (struct tnode *) n;
2102                iter->index = 0;
2103                iter->depth = 1;
2104        } else {
2105                iter->tnode = NULL;
2106                iter->index = 0;
2107                iter->depth = 0;
2108        }
2109
2110        return n;
2111}
2112
2113static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2114{
2115        struct node *n;
2116        struct fib_trie_iter iter;
2117
2118        memset(s, 0, sizeof(*s));
2119
2120        rcu_read_lock();
2121        for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2122                if (IS_LEAF(n)) {
2123                        struct leaf *l = (struct leaf *)n;
2124                        struct leaf_info *li;
2125                        struct hlist_node *tmp;
2126
2127                        s->leaves++;
2128                        s->totdepth += iter.depth;
2129                        if (iter.depth > s->maxdepth)
2130                                s->maxdepth = iter.depth;
2131
2132                        hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2133                                ++s->prefixes;
2134                } else {
2135                        const struct tnode *tn = (const struct tnode *) n;
2136                        int i;
2137
2138                        s->tnodes++;
2139                        if (tn->bits < MAX_STAT_DEPTH)
2140                                s->nodesizes[tn->bits]++;
2141
2142                        for (i = 0; i < (1<<tn->bits); i++)
2143                                if (!tn->child[i])
2144                                        s->nullpointers++;
2145                }
2146        }
2147        rcu_read_unlock();
2148}
2149
2150/*
2151 *      This outputs /proc/net/fib_triestats
2152 */
2153static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2154{
2155        unsigned i, max, pointers, bytes, avdepth;
2156
2157        if (stat->leaves)
2158                avdepth = stat->totdepth*100 / stat->leaves;
2159        else
2160                avdepth = 0;
2161
2162        seq_printf(seq, "\tAver depth:     %u.%02d\n",
2163                   avdepth / 100, avdepth % 100);
2164        seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2165
2166        seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2167        bytes = sizeof(struct leaf) * stat->leaves;
2168
2169        seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2170        bytes += sizeof(struct leaf_info) * stat->prefixes;
2171
2172        seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2173        bytes += sizeof(struct tnode) * stat->tnodes;
2174
2175        max = MAX_STAT_DEPTH;
2176        while (max > 0 && stat->nodesizes[max-1] == 0)
2177                max--;
2178
2179        pointers = 0;
2180        for (i = 1; i <= max; i++)
2181                if (stat->nodesizes[i] != 0) {
2182                        seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2183                        pointers += (1<<i) * stat->nodesizes[i];
2184                }
2185        seq_putc(seq, '\n');
2186        seq_printf(seq, "\tPointers: %u\n", pointers);
2187
2188        bytes += sizeof(struct node *) * pointers;
2189        seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2190        seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2191}
2192
2193#ifdef CONFIG_IP_FIB_TRIE_STATS
2194static void trie_show_usage(struct seq_file *seq,
2195                            const struct trie_use_stats *stats)
2196{
2197        seq_printf(seq, "\nCounters:\n---------\n");
2198        seq_printf(seq, "gets = %u\n", stats->gets);
2199        seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2200        seq_printf(seq, "semantic match passed = %u\n",
2201                   stats->semantic_match_passed);
2202        seq_printf(seq, "semantic match miss = %u\n",
2203                   stats->semantic_match_miss);
2204        seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2205        seq_printf(seq, "skipped node resize = %u\n\n",
2206                   stats->resize_node_skipped);
2207}
2208#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2209
2210static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2211{
2212        if (tb->tb_id == RT_TABLE_LOCAL)
2213                seq_puts(seq, "Local:\n");
2214        else if (tb->tb_id == RT_TABLE_MAIN)
2215                seq_puts(seq, "Main:\n");
2216        else
2217                seq_printf(seq, "Id %d:\n", tb->tb_id);
2218}
2219
2220
2221static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2222{
2223        struct net *net = (struct net *)seq->private;
2224        unsigned int h;
2225
2226        seq_printf(seq,
2227                   "Basic info: size of leaf:"
2228                   " %Zd bytes, size of tnode: %Zd bytes.\n",
2229                   sizeof(struct leaf), sizeof(struct tnode));
2230
2231        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2232                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2233                struct hlist_node *node;
2234                struct fib_table *tb;
2235
2236                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2237                        struct trie *t = (struct trie *) tb->tb_data;
2238                        struct trie_stat stat;
2239
2240                        if (!t)
2241                                continue;
2242
2243                        fib_table_print(seq, tb);
2244
2245                        trie_collect_stats(t, &stat);
2246                        trie_show_stats(seq, &stat);
2247#ifdef CONFIG_IP_FIB_TRIE_STATS
2248                        trie_show_usage(seq, &t->stats);
2249#endif
2250                }
2251        }
2252
2253        return 0;
2254}
2255
2256static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2257{
2258        return single_open_net(inode, file, fib_triestat_seq_show);
2259}
2260
2261static const struct file_operations fib_triestat_fops = {
2262        .owner  = THIS_MODULE,
2263        .open   = fib_triestat_seq_open,
2264        .read   = seq_read,
2265        .llseek = seq_lseek,
2266        .release = single_release_net,
2267};
2268
2269static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2270{
2271        struct fib_trie_iter *iter = seq->private;
2272        struct net *net = seq_file_net(seq);
2273        loff_t idx = 0;
2274        unsigned int h;
2275
2276        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2277                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2278                struct hlist_node *node;
2279                struct fib_table *tb;
2280
2281                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2282                        struct node *n;
2283
2284                        for (n = fib_trie_get_first(iter,
2285                                                    (struct trie *) tb->tb_data);
2286                             n; n = fib_trie_get_next(iter))
2287                                if (pos == idx++) {
2288                                        iter->tb = tb;
2289                                        return n;
2290                                }
2291                }
2292        }
2293
2294        return NULL;
2295}
2296
2297static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2298        __acquires(RCU)
2299{
2300        rcu_read_lock();
2301        return fib_trie_get_idx(seq, *pos);
2302}
2303
2304static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2305{
2306        struct fib_trie_iter *iter = seq->private;
2307        struct net *net = seq_file_net(seq);
2308        struct fib_table *tb = iter->tb;
2309        struct hlist_node *tb_node;
2310        unsigned int h;
2311        struct node *n;
2312
2313        ++*pos;
2314        /* next node in same table */
2315        n = fib_trie_get_next(iter);
2316        if (n)
2317                return n;
2318
2319        /* walk rest of this hash chain */
2320        h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2321        while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2322                tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2323                n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2324                if (n)
2325                        goto found;
2326        }
2327
2328        /* new hash chain */
2329        while (++h < FIB_TABLE_HASHSZ) {
2330                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2331                hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2332                        n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2333                        if (n)
2334                                goto found;
2335                }
2336        }
2337        return NULL;
2338
2339found:
2340        iter->tb = tb;
2341        return n;
2342}
2343
2344static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2345        __releases(RCU)
2346{
2347        rcu_read_unlock();
2348}
2349
2350static void seq_indent(struct seq_file *seq, int n)
2351{
2352        while (n-- > 0) seq_puts(seq, "   ");
2353}
2354
2355static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2356{
2357        switch (s) {
2358        case RT_SCOPE_UNIVERSE: return "universe";
2359        case RT_SCOPE_SITE:     return "site";
2360        case RT_SCOPE_LINK:     return "link";
2361        case RT_SCOPE_HOST:     return "host";
2362        case RT_SCOPE_NOWHERE:  return "nowhere";
2363        default:
2364                snprintf(buf, len, "scope=%d", s);
2365                return buf;
2366        }
2367}
2368
2369static const char *const rtn_type_names[__RTN_MAX] = {
2370        [RTN_UNSPEC] = "UNSPEC",
2371        [RTN_UNICAST] = "UNICAST",
2372        [RTN_LOCAL] = "LOCAL",
2373        [RTN_BROADCAST] = "BROADCAST",
2374        [RTN_ANYCAST] = "ANYCAST",
2375        [RTN_MULTICAST] = "MULTICAST",
2376        [RTN_BLACKHOLE] = "BLACKHOLE",
2377        [RTN_UNREACHABLE] = "UNREACHABLE",
2378        [RTN_PROHIBIT] = "PROHIBIT",
2379        [RTN_THROW] = "THROW",
2380        [RTN_NAT] = "NAT",
2381        [RTN_XRESOLVE] = "XRESOLVE",
2382};
2383
2384static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2385{
2386        if (t < __RTN_MAX && rtn_type_names[t])
2387                return rtn_type_names[t];
2388        snprintf(buf, len, "type %u", t);
2389        return buf;
2390}
2391
2392/* Pretty print the trie */
2393static int fib_trie_seq_show(struct seq_file *seq, void *v)
2394{
2395        const struct fib_trie_iter *iter = seq->private;
2396        struct node *n = v;
2397
2398        if (!node_parent_rcu(n))
2399                fib_table_print(seq, iter->tb);
2400
2401        if (IS_TNODE(n)) {
2402                struct tnode *tn = (struct tnode *) n;
2403                __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2404
2405                seq_indent(seq, iter->depth-1);
2406                seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2407                           &prf, tn->pos, tn->bits, tn->full_children,
2408                           tn->empty_children);
2409
2410        } else {
2411                struct leaf *l = (struct leaf *) n;
2412                struct leaf_info *li;
2413                struct hlist_node *node;
2414                __be32 val = htonl(l->key);
2415
2416                seq_indent(seq, iter->depth);
2417                seq_printf(seq, "  |-- %pI4\n", &val);
2418
2419                hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2420                        struct fib_alias *fa;
2421
2422                        list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2423                                char buf1[32], buf2[32];
2424
2425                                seq_indent(seq, iter->depth+1);
2426                                seq_printf(seq, "  /%d %s %s", li->plen,
2427                                           rtn_scope(buf1, sizeof(buf1),
2428                                                     fa->fa_scope),
2429                                           rtn_type(buf2, sizeof(buf2),
2430                                                    fa->fa_type));
2431                                if (fa->fa_tos)
2432                                        seq_printf(seq, " tos=%d", fa->fa_tos);
2433                                seq_putc(seq, '\n');
2434                        }
2435                }
2436        }
2437
2438        return 0;
2439}
2440
2441static const struct seq_operations fib_trie_seq_ops = {
2442        .start  = fib_trie_seq_start,
2443        .next   = fib_trie_seq_next,
2444        .stop   = fib_trie_seq_stop,
2445        .show   = fib_trie_seq_show,
2446};
2447
2448static int fib_trie_seq_open(struct inode *inode, struct file *file)
2449{
2450        return seq_open_net(inode, file, &fib_trie_seq_ops,
2451                            sizeof(struct fib_trie_iter));
2452}
2453
2454static const struct file_operations fib_trie_fops = {
2455        .owner  = THIS_MODULE,
2456        .open   = fib_trie_seq_open,
2457        .read   = seq_read,
2458        .llseek = seq_lseek,
2459        .release = seq_release_net,
2460};
2461
2462struct fib_route_iter {
2463        struct seq_net_private p;
2464        struct trie *main_trie;
2465        loff_t  pos;
2466        t_key   key;
2467};
2468
2469static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2470{
2471        struct leaf *l = NULL;
2472        struct trie *t = iter->main_trie;
2473
2474        /* use cache location of last found key */
2475        if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2476                pos -= iter->pos;
2477        else {
2478                iter->pos = 0;
2479                l = trie_firstleaf(t);
2480        }
2481
2482        while (l && pos-- > 0) {
2483                iter->pos++;
2484                l = trie_nextleaf(l);
2485        }
2486
2487        if (l)
2488                iter->key = pos;        /* remember it */
2489        else
2490                iter->pos = 0;          /* forget it */
2491
2492        return l;
2493}
2494
2495static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2496        __acquires(RCU)
2497{
2498        struct fib_route_iter *iter = seq->private;
2499        struct fib_table *tb;
2500
2501        rcu_read_lock();
2502        tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2503        if (!tb)
2504                return NULL;
2505
2506        iter->main_trie = (struct trie *) tb->tb_data;
2507        if (*pos == 0)
2508                return SEQ_START_TOKEN;
2509        else
2510                return fib_route_get_idx(iter, *pos - 1);
2511}
2512
2513static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2514{
2515        struct fib_route_iter *iter = seq->private;
2516        struct leaf *l = v;
2517
2518        ++*pos;
2519        if (v == SEQ_START_TOKEN) {
2520                iter->pos = 0;
2521                l = trie_firstleaf(iter->main_trie);
2522        } else {
2523                iter->pos++;
2524                l = trie_nextleaf(l);
2525        }
2526
2527        if (l)
2528                iter->key = l->key;
2529        else
2530                iter->pos = 0;
2531        return l;
2532}
2533
2534static void fib_route_seq_stop(struct seq_file *seq, void *v)
2535        __releases(RCU)
2536{
2537        rcu_read_unlock();
2538}
2539
2540static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2541{
2542        static unsigned type2flags[RTN_MAX + 1] = {
2543                [7] = RTF_REJECT, [8] = RTF_REJECT,
2544        };
2545        unsigned flags = type2flags[type];
2546
2547        if (fi && fi->fib_nh->nh_gw)
2548                flags |= RTF_GATEWAY;
2549        if (mask == htonl(0xFFFFFFFF))
2550                flags |= RTF_HOST;
2551        flags |= RTF_UP;
2552        return flags;
2553}
2554
2555/*
2556 *      This outputs /proc/net/route.
2557 *      The format of the file is not supposed to be changed
2558 *      and needs to be same as fib_hash output to avoid breaking
2559 *      legacy utilities
2560 */
2561static int fib_route_seq_show(struct seq_file *seq, void *v)
2562{
2563        struct leaf *l = v;
2564        struct leaf_info *li;
2565        struct hlist_node *node;
2566
2567        if (v == SEQ_START_TOKEN) {
2568                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2569                           "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2570                           "\tWindow\tIRTT");
2571                return 0;
2572        }
2573
2574        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2575                struct fib_alias *fa;
2576                __be32 mask, prefix;
2577
2578                mask = inet_make_mask(li->plen);
2579                prefix = htonl(l->key);
2580
2581                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2582                        const struct fib_info *fi = fa->fa_info;
2583                        unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2584                        int len;
2585
2586                        if (fa->fa_type == RTN_BROADCAST
2587                            || fa->fa_type == RTN_MULTICAST)
2588                                continue;
2589
2590                        if (fi)
2591                                seq_printf(seq,
2592                                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2593                                         "%d\t%08X\t%d\t%u\t%u%n",
2594                                         fi->fib_dev ? fi->fib_dev->name : "*",
2595                                         prefix,
2596                                         fi->fib_nh->nh_gw, flags, 0, 0,
2597                                         fi->fib_priority,
2598                                         mask,
2599                                         (fi->fib_advmss ?
2600                                          fi->fib_advmss + 40 : 0),
2601                                         fi->fib_window,
2602                                         fi->fib_rtt >> 3, &len);
2603                        else
2604                                seq_printf(seq,
2605                                         "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2606                                         "%d\t%08X\t%d\t%u\t%u%n",
2607                                         prefix, 0, flags, 0, 0, 0,
2608                                         mask, 0, 0, 0, &len);
2609
2610                        seq_printf(seq, "%*s\n", 127 - len, "");
2611                }
2612        }
2613
2614        return 0;
2615}
2616
2617static const struct seq_operations fib_route_seq_ops = {
2618        .start  = fib_route_seq_start,
2619        .next   = fib_route_seq_next,
2620        .stop   = fib_route_seq_stop,
2621        .show   = fib_route_seq_show,
2622};
2623
2624static int fib_route_seq_open(struct inode *inode, struct file *file)
2625{
2626        return seq_open_net(inode, file, &fib_route_seq_ops,
2627                            sizeof(struct fib_route_iter));
2628}
2629
2630static const struct file_operations fib_route_fops = {
2631        .owner  = THIS_MODULE,
2632        .open   = fib_route_seq_open,
2633        .read   = seq_read,
2634        .llseek = seq_lseek,
2635        .release = seq_release_net,
2636};
2637
2638int __net_init fib_proc_init(struct net *net)
2639{
2640        if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2641                goto out1;
2642
2643        if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2644                                  &fib_triestat_fops))
2645                goto out2;
2646
2647        if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2648                goto out3;
2649
2650        return 0;
2651
2652out3:
2653        proc_net_remove(net, "fib_triestat");
2654out2:
2655        proc_net_remove(net, "fib_trie");
2656out1:
2657        return -ENOMEM;
2658}
2659
2660void __net_exit fib_proc_exit(struct net *net)
2661{
2662        proc_net_remove(net, "fib_trie");
2663        proc_net_remove(net, "fib_triestat");
2664        proc_net_remove(net, "route");
2665}
2666
2667#endif /* CONFIG_PROC_FS */
2668
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.