linux/net/ipv4/fib_trie.c
<<
>>
Prefs
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally described in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET         An implementation of the TCP/IP protocol suite for the LINUX
  30 *              operating system.  INET is implemented using the  BSD Socket
  31 *              interface as the means of communication with the user level.
  32 *
  33 *              IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *              This program is free software; you can redistribute it and/or
  39 *              modify it under the terms of the GNU General Public License
  40 *              as published by the Free Software Foundation; either version
  41 *              2 of the License, or (at your option) any later version.
  42 *
  43 * Substantial contributions to this work comes from:
  44 *
  45 *              David S. Miller, <davem@davemloft.net>
  46 *              Stephen Hemminger <shemminger@osdl.org>
  47 *              Paul E. McKenney <paulmck@us.ibm.com>
  48 *              Patrick McHardy <kaber@trash.net>
  49 */
  50
  51#define VERSION "0.409"
  52
  53#include <asm/uaccess.h>
  54#include <linux/bitops.h>
  55#include <linux/types.h>
  56#include <linux/kernel.h>
  57#include <linux/mm.h>
  58#include <linux/string.h>
  59#include <linux/socket.h>
  60#include <linux/sockios.h>
  61#include <linux/errno.h>
  62#include <linux/in.h>
  63#include <linux/inet.h>
  64#include <linux/inetdevice.h>
  65#include <linux/netdevice.h>
  66#include <linux/if_arp.h>
  67#include <linux/proc_fs.h>
  68#include <linux/rcupdate.h>
  69#include <linux/skbuff.h>
  70#include <linux/netlink.h>
  71#include <linux/init.h>
  72#include <linux/list.h>
  73#include <linux/slab.h>
  74#include <linux/prefetch.h>
  75#include <linux/export.h>
  76#include <net/net_namespace.h>
  77#include <net/ip.h>
  78#include <net/protocol.h>
  79#include <net/route.h>
  80#include <net/tcp.h>
  81#include <net/sock.h>
  82#include <net/ip_fib.h>
  83#include "fib_lookup.h"
  84
  85#define MAX_STAT_DEPTH 32
  86
  87#define KEYLENGTH (8*sizeof(t_key))
  88
  89typedef unsigned int t_key;
  90
  91#define T_TNODE 0
  92#define T_LEAF  1
  93#define NODE_TYPE_MASK  0x1UL
  94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  95
  96#define IS_TNODE(n) (!(n->parent & T_LEAF))
  97#define IS_LEAF(n) (n->parent & T_LEAF)
  98
  99struct rt_trie_node {
 100        unsigned long parent;
 101        t_key key;
 102};
 103
 104struct leaf {
 105        unsigned long parent;
 106        t_key key;
 107        struct hlist_head list;
 108        struct rcu_head rcu;
 109};
 110
 111struct leaf_info {
 112        struct hlist_node hlist;
 113        int plen;
 114        u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
 115        struct list_head falh;
 116        struct rcu_head rcu;
 117};
 118
 119struct tnode {
 120        unsigned long parent;
 121        t_key key;
 122        unsigned char pos;              /* 2log(KEYLENGTH) bits needed */
 123        unsigned char bits;             /* 2log(KEYLENGTH) bits needed */
 124        unsigned int full_children;     /* KEYLENGTH bits needed */
 125        unsigned int empty_children;    /* KEYLENGTH bits needed */
 126        union {
 127                struct rcu_head rcu;
 128                struct work_struct work;
 129                struct tnode *tnode_free;
 130        };
 131        struct rt_trie_node __rcu *child[0];
 132};
 133
 134#ifdef CONFIG_IP_FIB_TRIE_STATS
 135struct trie_use_stats {
 136        unsigned int gets;
 137        unsigned int backtrack;
 138        unsigned int semantic_match_passed;
 139        unsigned int semantic_match_miss;
 140        unsigned int null_node_hit;
 141        unsigned int resize_node_skipped;
 142};
 143#endif
 144
 145struct trie_stat {
 146        unsigned int totdepth;
 147        unsigned int maxdepth;
 148        unsigned int tnodes;
 149        unsigned int leaves;
 150        unsigned int nullpointers;
 151        unsigned int prefixes;
 152        unsigned int nodesizes[MAX_STAT_DEPTH];
 153};
 154
 155struct trie {
 156        struct rt_trie_node __rcu *trie;
 157#ifdef CONFIG_IP_FIB_TRIE_STATS
 158        struct trie_use_stats stats;
 159#endif
 160};
 161
 162static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 163                                  int wasfull);
 164static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
 165static struct tnode *inflate(struct trie *t, struct tnode *tn);
 166static struct tnode *halve(struct trie *t, struct tnode *tn);
 167/* tnodes to free after resize(); protected by RTNL */
 168static struct tnode *tnode_free_head;
 169static size_t tnode_free_size;
 170
 171/*
 172 * synchronize_rcu after call_rcu for that many pages; it should be especially
 173 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 174 * obtained experimentally, aiming to avoid visible slowdown.
 175 */
 176static const int sync_pages = 128;
 177
 178static struct kmem_cache *fn_alias_kmem __read_mostly;
 179static struct kmem_cache *trie_leaf_kmem __read_mostly;
 180
 181/*
 182 * caller must hold RTNL
 183 */
 184static inline struct tnode *node_parent(const struct rt_trie_node *node)
 185{
 186        unsigned long parent;
 187
 188        parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
 189
 190        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 191}
 192
 193/*
 194 * caller must hold RCU read lock or RTNL
 195 */
 196static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
 197{
 198        unsigned long parent;
 199
 200        parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
 201                                                           lockdep_rtnl_is_held());
 202
 203        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 204}
 205
 206/* Same as rcu_assign_pointer
 207 * but that macro() assumes that value is a pointer.
 208 */
 209static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
 210{
 211        smp_wmb();
 212        node->parent = (unsigned long)ptr | NODE_TYPE(node);
 213}
 214
 215/*
 216 * caller must hold RTNL
 217 */
 218static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
 219{
 220        BUG_ON(i >= 1U << tn->bits);
 221
 222        return rtnl_dereference(tn->child[i]);
 223}
 224
 225/*
 226 * caller must hold RCU read lock or RTNL
 227 */
 228static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
 229{
 230        BUG_ON(i >= 1U << tn->bits);
 231
 232        return rcu_dereference_rtnl(tn->child[i]);
 233}
 234
 235static inline int tnode_child_length(const struct tnode *tn)
 236{
 237        return 1 << tn->bits;
 238}
 239
 240static inline t_key mask_pfx(t_key k, unsigned int l)
 241{
 242        return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 243}
 244
 245static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
 246{
 247        if (offset < KEYLENGTH)
 248                return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 249        else
 250                return 0;
 251}
 252
 253static inline int tkey_equals(t_key a, t_key b)
 254{
 255        return a == b;
 256}
 257
 258static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 259{
 260        if (bits == 0 || offset >= KEYLENGTH)
 261                return 1;
 262        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 263        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 264}
 265
 266static inline int tkey_mismatch(t_key a, int offset, t_key b)
 267{
 268        t_key diff = a ^ b;
 269        int i = offset;
 270
 271        if (!diff)
 272                return 0;
 273        while ((diff << i) >> (KEYLENGTH-1) == 0)
 274                i++;
 275        return i;
 276}
 277
 278/*
 279  To understand this stuff, an understanding of keys and all their bits is
 280  necessary. Every node in the trie has a key associated with it, but not
 281  all of the bits in that key are significant.
 282
 283  Consider a node 'n' and its parent 'tp'.
 284
 285  If n is a leaf, every bit in its key is significant. Its presence is
 286  necessitated by path compression, since during a tree traversal (when
 287  searching for a leaf - unless we are doing an insertion) we will completely
 288  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 289  a potentially successful search, that we have indeed been walking the
 290  correct key path.
 291
 292  Note that we can never "miss" the correct key in the tree if present by
 293  following the wrong path. Path compression ensures that segments of the key
 294  that are the same for all keys with a given prefix are skipped, but the
 295  skipped part *is* identical for each node in the subtrie below the skipped
 296  bit! trie_insert() in this implementation takes care of that - note the
 297  call to tkey_sub_equals() in trie_insert().
 298
 299  if n is an internal node - a 'tnode' here, the various parts of its key
 300  have many different meanings.
 301
 302  Example:
 303  _________________________________________________________________
 304  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 305  -----------------------------------------------------------------
 306    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 307
 308  _________________________________________________________________
 309  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 310  -----------------------------------------------------------------
 311   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 312
 313  tp->pos = 7
 314  tp->bits = 3
 315  n->pos = 15
 316  n->bits = 4
 317
 318  First, let's just ignore the bits that come before the parent tp, that is
 319  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 320  not use them for anything.
 321
 322  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 323  index into the parent's child array. That is, they will be used to find
 324  'n' among tp's children.
 325
 326  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 327  for the node n.
 328
 329  All the bits we have seen so far are significant to the node n. The rest
 330  of the bits are really not needed or indeed known in n->key.
 331
 332  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 333  n's child array, and will of course be different for each child.
 334
 335
 336  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 337  at this point.
 338
 339*/
 340
 341static inline void check_tnode(const struct tnode *tn)
 342{
 343        WARN_ON(tn && tn->pos+tn->bits > 32);
 344}
 345
 346static const int halve_threshold = 25;
 347static const int inflate_threshold = 50;
 348static const int halve_threshold_root = 15;
 349static const int inflate_threshold_root = 30;
 350
 351static void __alias_free_mem(struct rcu_head *head)
 352{
 353        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 354        kmem_cache_free(fn_alias_kmem, fa);
 355}
 356
 357static inline void alias_free_mem_rcu(struct fib_alias *fa)
 358{
 359        call_rcu(&fa->rcu, __alias_free_mem);
 360}
 361
 362static void __leaf_free_rcu(struct rcu_head *head)
 363{
 364        struct leaf *l = container_of(head, struct leaf, rcu);
 365        kmem_cache_free(trie_leaf_kmem, l);
 366}
 367
 368static inline void free_leaf(struct leaf *l)
 369{
 370        call_rcu(&l->rcu, __leaf_free_rcu);
 371}
 372
 373static inline void free_leaf_info(struct leaf_info *leaf)
 374{
 375        kfree_rcu(leaf, rcu);
 376}
 377
 378static struct tnode *tnode_alloc(size_t size)
 379{
 380        if (size <= PAGE_SIZE)
 381                return kzalloc(size, GFP_KERNEL);
 382        else
 383                return vzalloc(size);
 384}
 385
 386static void __tnode_vfree(struct work_struct *arg)
 387{
 388        struct tnode *tn = container_of(arg, struct tnode, work);
 389        vfree(tn);
 390}
 391
 392static void __tnode_free_rcu(struct rcu_head *head)
 393{
 394        struct tnode *tn = container_of(head, struct tnode, rcu);
 395        size_t size = sizeof(struct tnode) +
 396                      (sizeof(struct rt_trie_node *) << tn->bits);
 397
 398        if (size <= PAGE_SIZE)
 399                kfree(tn);
 400        else {
 401                INIT_WORK(&tn->work, __tnode_vfree);
 402                schedule_work(&tn->work);
 403        }
 404}
 405
 406static inline void tnode_free(struct tnode *tn)
 407{
 408        if (IS_LEAF(tn))
 409                free_leaf((struct leaf *) tn);
 410        else
 411                call_rcu(&tn->rcu, __tnode_free_rcu);
 412}
 413
 414static void tnode_free_safe(struct tnode *tn)
 415{
 416        BUG_ON(IS_LEAF(tn));
 417        tn->tnode_free = tnode_free_head;
 418        tnode_free_head = tn;
 419        tnode_free_size += sizeof(struct tnode) +
 420                           (sizeof(struct rt_trie_node *) << tn->bits);
 421}
 422
 423static void tnode_free_flush(void)
 424{
 425        struct tnode *tn;
 426
 427        while ((tn = tnode_free_head)) {
 428                tnode_free_head = tn->tnode_free;
 429                tn->tnode_free = NULL;
 430                tnode_free(tn);
 431        }
 432
 433        if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 434                tnode_free_size = 0;
 435                synchronize_rcu();
 436        }
 437}
 438
 439static struct leaf *leaf_new(void)
 440{
 441        struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 442        if (l) {
 443                l->parent = T_LEAF;
 444                INIT_HLIST_HEAD(&l->list);
 445        }
 446        return l;
 447}
 448
 449static struct leaf_info *leaf_info_new(int plen)
 450{
 451        struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 452        if (li) {
 453                li->plen = plen;
 454                li->mask_plen = ntohl(inet_make_mask(plen));
 455                INIT_LIST_HEAD(&li->falh);
 456        }
 457        return li;
 458}
 459
 460static struct tnode *tnode_new(t_key key, int pos, int bits)
 461{
 462        size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
 463        struct tnode *tn = tnode_alloc(sz);
 464
 465        if (tn) {
 466                tn->parent = T_TNODE;
 467                tn->pos = pos;
 468                tn->bits = bits;
 469                tn->key = key;
 470                tn->full_children = 0;
 471                tn->empty_children = 1<<bits;
 472        }
 473
 474        pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
 475                 sizeof(struct rt_trie_node *) << bits);
 476        return tn;
 477}
 478
 479/*
 480 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 481 * and no bits are skipped. See discussion in dyntree paper p. 6
 482 */
 483
 484static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
 485{
 486        if (n == NULL || IS_LEAF(n))
 487                return 0;
 488
 489        return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 490}
 491
 492static inline void put_child(struct tnode *tn, int i,
 493                             struct rt_trie_node *n)
 494{
 495        tnode_put_child_reorg(tn, i, n, -1);
 496}
 497
 498 /*
 499  * Add a child at position i overwriting the old value.
 500  * Update the value of full_children and empty_children.
 501  */
 502
 503static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 504                                  int wasfull)
 505{
 506        struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
 507        int isfull;
 508
 509        BUG_ON(i >= 1<<tn->bits);
 510
 511        /* update emptyChildren */
 512        if (n == NULL && chi != NULL)
 513                tn->empty_children++;
 514        else if (n != NULL && chi == NULL)
 515                tn->empty_children--;
 516
 517        /* update fullChildren */
 518        if (wasfull == -1)
 519                wasfull = tnode_full(tn, chi);
 520
 521        isfull = tnode_full(tn, n);
 522        if (wasfull && !isfull)
 523                tn->full_children--;
 524        else if (!wasfull && isfull)
 525                tn->full_children++;
 526
 527        if (n)
 528                node_set_parent(n, tn);
 529
 530        rcu_assign_pointer(tn->child[i], n);
 531}
 532
 533#define MAX_WORK 10
 534static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 535{
 536        int i;
 537        struct tnode *old_tn;
 538        int inflate_threshold_use;
 539        int halve_threshold_use;
 540        int max_work;
 541
 542        if (!tn)
 543                return NULL;
 544
 545        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 546                 tn, inflate_threshold, halve_threshold);
 547
 548        /* No children */
 549        if (tn->empty_children == tnode_child_length(tn)) {
 550                tnode_free_safe(tn);
 551                return NULL;
 552        }
 553        /* One child */
 554        if (tn->empty_children == tnode_child_length(tn) - 1)
 555                goto one_child;
 556        /*
 557         * Double as long as the resulting node has a number of
 558         * nonempty nodes that are above the threshold.
 559         */
 560
 561        /*
 562         * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 563         * the Helsinki University of Technology and Matti Tikkanen of Nokia
 564         * Telecommunications, page 6:
 565         * "A node is doubled if the ratio of non-empty children to all
 566         * children in the *doubled* node is at least 'high'."
 567         *
 568         * 'high' in this instance is the variable 'inflate_threshold'. It
 569         * is expressed as a percentage, so we multiply it with
 570         * tnode_child_length() and instead of multiplying by 2 (since the
 571         * child array will be doubled by inflate()) and multiplying
 572         * the left-hand side by 100 (to handle the percentage thing) we
 573         * multiply the left-hand side by 50.
 574         *
 575         * The left-hand side may look a bit weird: tnode_child_length(tn)
 576         * - tn->empty_children is of course the number of non-null children
 577         * in the current node. tn->full_children is the number of "full"
 578         * children, that is non-null tnodes with a skip value of 0.
 579         * All of those will be doubled in the resulting inflated tnode, so
 580         * we just count them one extra time here.
 581         *
 582         * A clearer way to write this would be:
 583         *
 584         * to_be_doubled = tn->full_children;
 585         * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 586         *     tn->full_children;
 587         *
 588         * new_child_length = tnode_child_length(tn) * 2;
 589         *
 590         * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 591         *      new_child_length;
 592         * if (new_fill_factor >= inflate_threshold)
 593         *
 594         * ...and so on, tho it would mess up the while () loop.
 595         *
 596         * anyway,
 597         * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 598         *      inflate_threshold
 599         *
 600         * avoid a division:
 601         * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 602         *      inflate_threshold * new_child_length
 603         *
 604         * expand not_to_be_doubled and to_be_doubled, and shorten:
 605         * 100 * (tnode_child_length(tn) - tn->empty_children +
 606         *    tn->full_children) >= inflate_threshold * new_child_length
 607         *
 608         * expand new_child_length:
 609         * 100 * (tnode_child_length(tn) - tn->empty_children +
 610         *    tn->full_children) >=
 611         *      inflate_threshold * tnode_child_length(tn) * 2
 612         *
 613         * shorten again:
 614         * 50 * (tn->full_children + tnode_child_length(tn) -
 615         *    tn->empty_children) >= inflate_threshold *
 616         *    tnode_child_length(tn)
 617         *
 618         */
 619
 620        check_tnode(tn);
 621
 622        /* Keep root node larger  */
 623
 624        if (!node_parent((struct rt_trie_node *)tn)) {
 625                inflate_threshold_use = inflate_threshold_root;
 626                halve_threshold_use = halve_threshold_root;
 627        } else {
 628                inflate_threshold_use = inflate_threshold;
 629                halve_threshold_use = halve_threshold;
 630        }
 631
 632        max_work = MAX_WORK;
 633        while ((tn->full_children > 0 &&  max_work-- &&
 634                50 * (tn->full_children + tnode_child_length(tn)
 635                      - tn->empty_children)
 636                >= inflate_threshold_use * tnode_child_length(tn))) {
 637
 638                old_tn = tn;
 639                tn = inflate(t, tn);
 640
 641                if (IS_ERR(tn)) {
 642                        tn = old_tn;
 643#ifdef CONFIG_IP_FIB_TRIE_STATS
 644                        t->stats.resize_node_skipped++;
 645#endif
 646                        break;
 647                }
 648        }
 649
 650        check_tnode(tn);
 651
 652        /* Return if at least one inflate is run */
 653        if (max_work != MAX_WORK)
 654                return (struct rt_trie_node *) tn;
 655
 656        /*
 657         * Halve as long as the number of empty children in this
 658         * node is above threshold.
 659         */
 660
 661        max_work = MAX_WORK;
 662        while (tn->bits > 1 &&  max_work-- &&
 663               100 * (tnode_child_length(tn) - tn->empty_children) <
 664               halve_threshold_use * tnode_child_length(tn)) {
 665
 666                old_tn = tn;
 667                tn = halve(t, tn);
 668                if (IS_ERR(tn)) {
 669                        tn = old_tn;
 670#ifdef CONFIG_IP_FIB_TRIE_STATS
 671                        t->stats.resize_node_skipped++;
 672#endif
 673                        break;
 674                }
 675        }
 676
 677
 678        /* Only one child remains */
 679        if (tn->empty_children == tnode_child_length(tn) - 1) {
 680one_child:
 681                for (i = 0; i < tnode_child_length(tn); i++) {
 682                        struct rt_trie_node *n;
 683
 684                        n = rtnl_dereference(tn->child[i]);
 685                        if (!n)
 686                                continue;
 687
 688                        /* compress one level */
 689
 690                        node_set_parent(n, NULL);
 691                        tnode_free_safe(tn);
 692                        return n;
 693                }
 694        }
 695        return (struct rt_trie_node *) tn;
 696}
 697
 698
 699static void tnode_clean_free(struct tnode *tn)
 700{
 701        int i;
 702        struct tnode *tofree;
 703
 704        for (i = 0; i < tnode_child_length(tn); i++) {
 705                tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
 706                if (tofree)
 707                        tnode_free(tofree);
 708        }
 709        tnode_free(tn);
 710}
 711
 712static struct tnode *inflate(struct trie *t, struct tnode *tn)
 713{
 714        struct tnode *oldtnode = tn;
 715        int olen = tnode_child_length(tn);
 716        int i;
 717
 718        pr_debug("In inflate\n");
 719
 720        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 721
 722        if (!tn)
 723                return ERR_PTR(-ENOMEM);
 724
 725        /*
 726         * Preallocate and store tnodes before the actual work so we
 727         * don't get into an inconsistent state if memory allocation
 728         * fails. In case of failure we return the oldnode and  inflate
 729         * of tnode is ignored.
 730         */
 731
 732        for (i = 0; i < olen; i++) {
 733                struct tnode *inode;
 734
 735                inode = (struct tnode *) tnode_get_child(oldtnode, i);
 736                if (inode &&
 737                    IS_TNODE(inode) &&
 738                    inode->pos == oldtnode->pos + oldtnode->bits &&
 739                    inode->bits > 1) {
 740                        struct tnode *left, *right;
 741                        t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 742
 743                        left = tnode_new(inode->key&(~m), inode->pos + 1,
 744                                         inode->bits - 1);
 745                        if (!left)
 746                                goto nomem;
 747
 748                        right = tnode_new(inode->key|m, inode->pos + 1,
 749                                          inode->bits - 1);
 750
 751                        if (!right) {
 752                                tnode_free(left);
 753                                goto nomem;
 754                        }
 755
 756                        put_child(tn, 2*i, (struct rt_trie_node *) left);
 757                        put_child(tn, 2*i+1, (struct rt_trie_node *) right);
 758                }
 759        }
 760
 761        for (i = 0; i < olen; i++) {
 762                struct tnode *inode;
 763                struct rt_trie_node *node = tnode_get_child(oldtnode, i);
 764                struct tnode *left, *right;
 765                int size, j;
 766
 767                /* An empty child */
 768                if (node == NULL)
 769                        continue;
 770
 771                /* A leaf or an internal node with skipped bits */
 772
 773                if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 774                   tn->pos + tn->bits - 1) {
 775                        if (tkey_extract_bits(node->key,
 776                                              oldtnode->pos + oldtnode->bits,
 777                                              1) == 0)
 778                                put_child(tn, 2*i, node);
 779                        else
 780                                put_child(tn, 2*i+1, node);
 781                        continue;
 782                }
 783
 784                /* An internal node with two children */
 785                inode = (struct tnode *) node;
 786
 787                if (inode->bits == 1) {
 788                        put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
 789                        put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
 790
 791                        tnode_free_safe(inode);
 792                        continue;
 793                }
 794
 795                /* An internal node with more than two children */
 796
 797                /* We will replace this node 'inode' with two new
 798                 * ones, 'left' and 'right', each with half of the
 799                 * original children. The two new nodes will have
 800                 * a position one bit further down the key and this
 801                 * means that the "significant" part of their keys
 802                 * (see the discussion near the top of this file)
 803                 * will differ by one bit, which will be "0" in
 804                 * left's key and "1" in right's key. Since we are
 805                 * moving the key position by one step, the bit that
 806                 * we are moving away from - the bit at position
 807                 * (inode->pos) - is the one that will differ between
 808                 * left and right. So... we synthesize that bit in the
 809                 * two  new keys.
 810                 * The mask 'm' below will be a single "one" bit at
 811                 * the position (inode->pos)
 812                 */
 813
 814                /* Use the old key, but set the new significant
 815                 *   bit to zero.
 816                 */
 817
 818                left = (struct tnode *) tnode_get_child(tn, 2*i);
 819                put_child(tn, 2*i, NULL);
 820
 821                BUG_ON(!left);
 822
 823                right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 824                put_child(tn, 2*i+1, NULL);
 825
 826                BUG_ON(!right);
 827
 828                size = tnode_child_length(left);
 829                for (j = 0; j < size; j++) {
 830                        put_child(left, j, rtnl_dereference(inode->child[j]));
 831                        put_child(right, j, rtnl_dereference(inode->child[j + size]));
 832                }
 833                put_child(tn, 2*i, resize(t, left));
 834                put_child(tn, 2*i+1, resize(t, right));
 835
 836                tnode_free_safe(inode);
 837        }
 838        tnode_free_safe(oldtnode);
 839        return tn;
 840nomem:
 841        tnode_clean_free(tn);
 842        return ERR_PTR(-ENOMEM);
 843}
 844
 845static struct tnode *halve(struct trie *t, struct tnode *tn)
 846{
 847        struct tnode *oldtnode = tn;
 848        struct rt_trie_node *left, *right;
 849        int i;
 850        int olen = tnode_child_length(tn);
 851
 852        pr_debug("In halve\n");
 853
 854        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 855
 856        if (!tn)
 857                return ERR_PTR(-ENOMEM);
 858
 859        /*
 860         * Preallocate and store tnodes before the actual work so we
 861         * don't get into an inconsistent state if memory allocation
 862         * fails. In case of failure we return the oldnode and halve
 863         * of tnode is ignored.
 864         */
 865
 866        for (i = 0; i < olen; i += 2) {
 867                left = tnode_get_child(oldtnode, i);
 868                right = tnode_get_child(oldtnode, i+1);
 869
 870                /* Two nonempty children */
 871                if (left && right) {
 872                        struct tnode *newn;
 873
 874                        newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 875
 876                        if (!newn)
 877                                goto nomem;
 878
 879                        put_child(tn, i/2, (struct rt_trie_node *)newn);
 880                }
 881
 882        }
 883
 884        for (i = 0; i < olen; i += 2) {
 885                struct tnode *newBinNode;
 886
 887                left = tnode_get_child(oldtnode, i);
 888                right = tnode_get_child(oldtnode, i+1);
 889
 890                /* At least one of the children is empty */
 891                if (left == NULL) {
 892                        if (right == NULL)    /* Both are empty */
 893                                continue;
 894                        put_child(tn, i/2, right);
 895                        continue;
 896                }
 897
 898                if (right == NULL) {
 899                        put_child(tn, i/2, left);
 900                        continue;
 901                }
 902
 903                /* Two nonempty children */
 904                newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 905                put_child(tn, i/2, NULL);
 906                put_child(newBinNode, 0, left);
 907                put_child(newBinNode, 1, right);
 908                put_child(tn, i/2, resize(t, newBinNode));
 909        }
 910        tnode_free_safe(oldtnode);
 911        return tn;
 912nomem:
 913        tnode_clean_free(tn);
 914        return ERR_PTR(-ENOMEM);
 915}
 916
 917/* readside must use rcu_read_lock currently dump routines
 918 via get_fa_head and dump */
 919
 920static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 921{
 922        struct hlist_head *head = &l->list;
 923        struct hlist_node *node;
 924        struct leaf_info *li;
 925
 926        hlist_for_each_entry_rcu(li, node, head, hlist)
 927                if (li->plen == plen)
 928                        return li;
 929
 930        return NULL;
 931}
 932
 933static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 934{
 935        struct leaf_info *li = find_leaf_info(l, plen);
 936
 937        if (!li)
 938                return NULL;
 939
 940        return &li->falh;
 941}
 942
 943static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 944{
 945        struct leaf_info *li = NULL, *last = NULL;
 946        struct hlist_node *node;
 947
 948        if (hlist_empty(head)) {
 949                hlist_add_head_rcu(&new->hlist, head);
 950        } else {
 951                hlist_for_each_entry(li, node, head, hlist) {
 952                        if (new->plen > li->plen)
 953                                break;
 954
 955                        last = li;
 956                }
 957                if (last)
 958                        hlist_add_after_rcu(&last->hlist, &new->hlist);
 959                else
 960                        hlist_add_before_rcu(&new->hlist, &li->hlist);
 961        }
 962}
 963
 964/* rcu_read_lock needs to be hold by caller from readside */
 965
 966static struct leaf *
 967fib_find_node(struct trie *t, u32 key)
 968{
 969        int pos;
 970        struct tnode *tn;
 971        struct rt_trie_node *n;
 972
 973        pos = 0;
 974        n = rcu_dereference_rtnl(t->trie);
 975
 976        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 977                tn = (struct tnode *) n;
 978
 979                check_tnode(tn);
 980
 981                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 982                        pos = tn->pos + tn->bits;
 983                        n = tnode_get_child_rcu(tn,
 984                                                tkey_extract_bits(key,
 985                                                                  tn->pos,
 986                                                                  tn->bits));
 987                } else
 988                        break;
 989        }
 990        /* Case we have found a leaf. Compare prefixes */
 991
 992        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 993                return (struct leaf *)n;
 994
 995        return NULL;
 996}
 997
 998static void trie_rebalance(struct trie *t, struct tnode *tn)
 999{
1000        int wasfull;
1001        t_key cindex, key;
1002        struct tnode *tp;
1003
1004        key = tn->key;
1005
1006        while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1007                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1008                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1009                tn = (struct tnode *)resize(t, tn);
1010
1011                tnode_put_child_reorg(tp, cindex,
1012                                      (struct rt_trie_node *)tn, wasfull);
1013
1014                tp = node_parent((struct rt_trie_node *) tn);
1015                if (!tp)
1016                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1017
1018                tnode_free_flush();
1019                if (!tp)
1020                        break;
1021                tn = tp;
1022        }
1023
1024        /* Handle last (top) tnode */
1025        if (IS_TNODE(tn))
1026                tn = (struct tnode *)resize(t, tn);
1027
1028        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1029        tnode_free_flush();
1030}
1031
1032/* only used from updater-side */
1033
1034static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1035{
1036        int pos, newpos;
1037        struct tnode *tp = NULL, *tn = NULL;
1038        struct rt_trie_node *n;
1039        struct leaf *l;
1040        int missbit;
1041        struct list_head *fa_head = NULL;
1042        struct leaf_info *li;
1043        t_key cindex;
1044
1045        pos = 0;
1046        n = rtnl_dereference(t->trie);
1047
1048        /* If we point to NULL, stop. Either the tree is empty and we should
1049         * just put a new leaf in if, or we have reached an empty child slot,
1050         * and we should just put our new leaf in that.
1051         * If we point to a T_TNODE, check if it matches our key. Note that
1052         * a T_TNODE might be skipping any number of bits - its 'pos' need
1053         * not be the parent's 'pos'+'bits'!
1054         *
1055         * If it does match the current key, get pos/bits from it, extract
1056         * the index from our key, push the T_TNODE and walk the tree.
1057         *
1058         * If it doesn't, we have to replace it with a new T_TNODE.
1059         *
1060         * If we point to a T_LEAF, it might or might not have the same key
1061         * as we do. If it does, just change the value, update the T_LEAF's
1062         * value, and return it.
1063         * If it doesn't, we need to replace it with a T_TNODE.
1064         */
1065
1066        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1067                tn = (struct tnode *) n;
1068
1069                check_tnode(tn);
1070
1071                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1072                        tp = tn;
1073                        pos = tn->pos + tn->bits;
1074                        n = tnode_get_child(tn,
1075                                            tkey_extract_bits(key,
1076                                                              tn->pos,
1077                                                              tn->bits));
1078
1079                        BUG_ON(n && node_parent(n) != tn);
1080                } else
1081                        break;
1082        }
1083
1084        /*
1085         * n  ----> NULL, LEAF or TNODE
1086         *
1087         * tp is n's (parent) ----> NULL or TNODE
1088         */
1089
1090        BUG_ON(tp && IS_LEAF(tp));
1091
1092        /* Case 1: n is a leaf. Compare prefixes */
1093
1094        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1095                l = (struct leaf *) n;
1096                li = leaf_info_new(plen);
1097
1098                if (!li)
1099                        return NULL;
1100
1101                fa_head = &li->falh;
1102                insert_leaf_info(&l->list, li);
1103                goto done;
1104        }
1105        l = leaf_new();
1106
1107        if (!l)
1108                return NULL;
1109
1110        l->key = key;
1111        li = leaf_info_new(plen);
1112
1113        if (!li) {
1114                free_leaf(l);
1115                return NULL;
1116        }
1117
1118        fa_head = &li->falh;
1119        insert_leaf_info(&l->list, li);
1120
1121        if (t->trie && n == NULL) {
1122                /* Case 2: n is NULL, and will just insert a new leaf */
1123
1124                node_set_parent((struct rt_trie_node *)l, tp);
1125
1126                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1127                put_child(tp, cindex, (struct rt_trie_node *)l);
1128        } else {
1129                /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1130                /*
1131                 *  Add a new tnode here
1132                 *  first tnode need some special handling
1133                 */
1134
1135                if (tp)
1136                        pos = tp->pos+tp->bits;
1137                else
1138                        pos = 0;
1139
1140                if (n) {
1141                        newpos = tkey_mismatch(key, pos, n->key);
1142                        tn = tnode_new(n->key, newpos, 1);
1143                } else {
1144                        newpos = 0;
1145                        tn = tnode_new(key, newpos, 1); /* First tnode */
1146                }
1147
1148                if (!tn) {
1149                        free_leaf_info(li);
1150                        free_leaf(l);
1151                        return NULL;
1152                }
1153
1154                node_set_parent((struct rt_trie_node *)tn, tp);
1155
1156                missbit = tkey_extract_bits(key, newpos, 1);
1157                put_child(tn, missbit, (struct rt_trie_node *)l);
1158                put_child(tn, 1-missbit, n);
1159
1160                if (tp) {
1161                        cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1162                        put_child(tp, cindex, (struct rt_trie_node *)tn);
1163                } else {
1164                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1165                        tp = tn;
1166                }
1167        }
1168
1169        if (tp && tp->pos + tp->bits > 32)
1170                pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1171                        tp, tp->pos, tp->bits, key, plen);
1172
1173        /* Rebalance the trie */
1174
1175        trie_rebalance(t, tp);
1176done:
1177        return fa_head;
1178}
1179
1180/*
1181 * Caller must hold RTNL.
1182 */
1183int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1184{
1185        struct trie *t = (struct trie *) tb->tb_data;
1186        struct fib_alias *fa, *new_fa;
1187        struct list_head *fa_head = NULL;
1188        struct fib_info *fi;
1189        int plen = cfg->fc_dst_len;
1190        u8 tos = cfg->fc_tos;
1191        u32 key, mask;
1192        int err;
1193        struct leaf *l;
1194
1195        if (plen > 32)
1196                return -EINVAL;
1197
1198        key = ntohl(cfg->fc_dst);
1199
1200        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1201
1202        mask = ntohl(inet_make_mask(plen));
1203
1204        if (key & ~mask)
1205                return -EINVAL;
1206
1207        key = key & mask;
1208
1209        fi = fib_create_info(cfg);
1210        if (IS_ERR(fi)) {
1211                err = PTR_ERR(fi);
1212                goto err;
1213        }
1214
1215        l = fib_find_node(t, key);
1216        fa = NULL;
1217
1218        if (l) {
1219                fa_head = get_fa_head(l, plen);
1220                fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1221        }
1222
1223        /* Now fa, if non-NULL, points to the first fib alias
1224         * with the same keys [prefix,tos,priority], if such key already
1225         * exists or to the node before which we will insert new one.
1226         *
1227         * If fa is NULL, we will need to allocate a new one and
1228         * insert to the head of f.
1229         *
1230         * If f is NULL, no fib node matched the destination key
1231         * and we need to allocate a new one of those as well.
1232         */
1233
1234        if (fa && fa->fa_tos == tos &&
1235            fa->fa_info->fib_priority == fi->fib_priority) {
1236                struct fib_alias *fa_first, *fa_match;
1237
1238                err = -EEXIST;
1239                if (cfg->fc_nlflags & NLM_F_EXCL)
1240                        goto out;
1241
1242                /* We have 2 goals:
1243                 * 1. Find exact match for type, scope, fib_info to avoid
1244                 * duplicate routes
1245                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1246                 */
1247                fa_match = NULL;
1248                fa_first = fa;
1249                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1250                list_for_each_entry_continue(fa, fa_head, fa_list) {
1251                        if (fa->fa_tos != tos)
1252                                break;
1253                        if (fa->fa_info->fib_priority != fi->fib_priority)
1254                                break;
1255                        if (fa->fa_type == cfg->fc_type &&
1256                            fa->fa_info == fi) {
1257                                fa_match = fa;
1258                                break;
1259                        }
1260                }
1261
1262                if (cfg->fc_nlflags & NLM_F_REPLACE) {
1263                        struct fib_info *fi_drop;
1264                        u8 state;
1265
1266                        fa = fa_first;
1267                        if (fa_match) {
1268                                if (fa == fa_match)
1269                                        err = 0;
1270                                goto out;
1271                        }
1272                        err = -ENOBUFS;
1273                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1274                        if (new_fa == NULL)
1275                                goto out;
1276
1277                        fi_drop = fa->fa_info;
1278                        new_fa->fa_tos = fa->fa_tos;
1279                        new_fa->fa_info = fi;
1280                        new_fa->fa_type = cfg->fc_type;
1281                        state = fa->fa_state;
1282                        new_fa->fa_state = state & ~FA_S_ACCESSED;
1283
1284                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1285                        alias_free_mem_rcu(fa);
1286
1287                        fib_release_info(fi_drop);
1288                        if (state & FA_S_ACCESSED)
1289                                rt_cache_flush(cfg->fc_nlinfo.nl_net);
1290                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1291                                tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1292
1293                        goto succeeded;
1294                }
1295                /* Error if we find a perfect match which
1296                 * uses the same scope, type, and nexthop
1297                 * information.
1298                 */
1299                if (fa_match)
1300                        goto out;
1301
1302                if (!(cfg->fc_nlflags & NLM_F_APPEND))
1303                        fa = fa_first;
1304        }
1305        err = -ENOENT;
1306        if (!(cfg->fc_nlflags & NLM_F_CREATE))
1307                goto out;
1308
1309        err = -ENOBUFS;
1310        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1311        if (new_fa == NULL)
1312                goto out;
1313
1314        new_fa->fa_info = fi;
1315        new_fa->fa_tos = tos;
1316        new_fa->fa_type = cfg->fc_type;
1317        new_fa->fa_state = 0;
1318        /*
1319         * Insert new entry to the list.
1320         */
1321
1322        if (!fa_head) {
1323                fa_head = fib_insert_node(t, key, plen);
1324                if (unlikely(!fa_head)) {
1325                        err = -ENOMEM;
1326                        goto out_free_new_fa;
1327                }
1328        }
1329
1330        if (!plen)
1331                tb->tb_num_default++;
1332
1333        list_add_tail_rcu(&new_fa->fa_list,
1334                          (fa ? &fa->fa_list : fa_head));
1335
1336        rt_cache_flush(cfg->fc_nlinfo.nl_net);
1337        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1338                  &cfg->fc_nlinfo, 0);
1339succeeded:
1340        return 0;
1341
1342out_free_new_fa:
1343        kmem_cache_free(fn_alias_kmem, new_fa);
1344out:
1345        fib_release_info(fi);
1346err:
1347        return err;
1348}
1349
1350/* should be called with rcu_read_lock */
1351static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1352                      t_key key,  const struct flowi4 *flp,
1353                      struct fib_result *res, int fib_flags)
1354{
1355        struct leaf_info *li;
1356        struct hlist_head *hhead = &l->list;
1357        struct hlist_node *node;
1358
1359        hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1360                struct fib_alias *fa;
1361
1362                if (l->key != (key & li->mask_plen))
1363                        continue;
1364
1365                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1366                        struct fib_info *fi = fa->fa_info;
1367                        int nhsel, err;
1368
1369                        if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1370                                continue;
1371                        if (fi->fib_dead)
1372                                continue;
1373                        if (fa->fa_info->fib_scope < flp->flowi4_scope)
1374                                continue;
1375                        fib_alias_accessed(fa);
1376                        err = fib_props[fa->fa_type].error;
1377                        if (err) {
1378#ifdef CONFIG_IP_FIB_TRIE_STATS
1379                                t->stats.semantic_match_passed++;
1380#endif
1381                                return err;
1382                        }
1383                        if (fi->fib_flags & RTNH_F_DEAD)
1384                                continue;
1385                        for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1386                                const struct fib_nh *nh = &fi->fib_nh[nhsel];
1387
1388                                if (nh->nh_flags & RTNH_F_DEAD)
1389                                        continue;
1390                                if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1391                                        continue;
1392
1393#ifdef CONFIG_IP_FIB_TRIE_STATS
1394                                t->stats.semantic_match_passed++;
1395#endif
1396                                res->prefixlen = li->plen;
1397                                res->nh_sel = nhsel;
1398                                res->type = fa->fa_type;
1399                                res->scope = fa->fa_info->fib_scope;
1400                                res->fi = fi;
1401                                res->table = tb;
1402                                res->fa_head = &li->falh;
1403                                if (!(fib_flags & FIB_LOOKUP_NOREF))
1404                                        atomic_inc(&fi->fib_clntref);
1405                                return 0;
1406                        }
1407                }
1408
1409#ifdef CONFIG_IP_FIB_TRIE_STATS
1410                t->stats.semantic_match_miss++;
1411#endif
1412        }
1413
1414        return 1;
1415}
1416
1417int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1418                     struct fib_result *res, int fib_flags)
1419{
1420        struct trie *t = (struct trie *) tb->tb_data;
1421        int ret;
1422        struct rt_trie_node *n;
1423        struct tnode *pn;
1424        unsigned int pos, bits;
1425        t_key key = ntohl(flp->daddr);
1426        unsigned int chopped_off;
1427        t_key cindex = 0;
1428        unsigned int current_prefix_length = KEYLENGTH;
1429        struct tnode *cn;
1430        t_key pref_mismatch;
1431
1432        rcu_read_lock();
1433
1434        n = rcu_dereference(t->trie);
1435        if (!n)
1436                goto failed;
1437
1438#ifdef CONFIG_IP_FIB_TRIE_STATS
1439        t->stats.gets++;
1440#endif
1441
1442        /* Just a leaf? */
1443        if (IS_LEAF(n)) {
1444                ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1445                goto found;
1446        }
1447
1448        pn = (struct tnode *) n;
1449        chopped_off = 0;
1450
1451        while (pn) {
1452                pos = pn->pos;
1453                bits = pn->bits;
1454
1455                if (!chopped_off)
1456                        cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1457                                                   pos, bits);
1458
1459                n = tnode_get_child_rcu(pn, cindex);
1460
1461                if (n == NULL) {
1462#ifdef CONFIG_IP_FIB_TRIE_STATS
1463                        t->stats.null_node_hit++;
1464#endif
1465                        goto backtrace;
1466                }
1467
1468                if (IS_LEAF(n)) {
1469                        ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1470                        if (ret > 0)
1471                                goto backtrace;
1472                        goto found;
1473                }
1474
1475                cn = (struct tnode *)n;
1476
1477                /*
1478                 * It's a tnode, and we can do some extra checks here if we
1479                 * like, to avoid descending into a dead-end branch.
1480                 * This tnode is in the parent's child array at index
1481                 * key[p_pos..p_pos+p_bits] but potentially with some bits
1482                 * chopped off, so in reality the index may be just a
1483                 * subprefix, padded with zero at the end.
1484                 * We can also take a look at any skipped bits in this
1485                 * tnode - everything up to p_pos is supposed to be ok,
1486                 * and the non-chopped bits of the index (se previous
1487                 * paragraph) are also guaranteed ok, but the rest is
1488                 * considered unknown.
1489                 *
1490                 * The skipped bits are key[pos+bits..cn->pos].
1491                 */
1492
1493                /* If current_prefix_length < pos+bits, we are already doing
1494                 * actual prefix  matching, which means everything from
1495                 * pos+(bits-chopped_off) onward must be zero along some
1496                 * branch of this subtree - otherwise there is *no* valid
1497                 * prefix present. Here we can only check the skipped
1498                 * bits. Remember, since we have already indexed into the
1499                 * parent's child array, we know that the bits we chopped of
1500                 * *are* zero.
1501                 */
1502
1503                /* NOTA BENE: Checking only skipped bits
1504                   for the new node here */
1505
1506                if (current_prefix_length < pos+bits) {
1507                        if (tkey_extract_bits(cn->key, current_prefix_length,
1508                                                cn->pos - current_prefix_length)
1509                            || !(cn->child[0]))
1510                                goto backtrace;
1511                }
1512
1513                /*
1514                 * If chopped_off=0, the index is fully validated and we
1515                 * only need to look at the skipped bits for this, the new,
1516                 * tnode. What we actually want to do is to find out if
1517                 * these skipped bits match our key perfectly, or if we will
1518                 * have to count on finding a matching prefix further down,
1519                 * because if we do, we would like to have some way of
1520                 * verifying the existence of such a prefix at this point.
1521                 */
1522
1523                /* The only thing we can do at this point is to verify that
1524                 * any such matching prefix can indeed be a prefix to our
1525                 * key, and if the bits in the node we are inspecting that
1526                 * do not match our key are not ZERO, this cannot be true.
1527                 * Thus, find out where there is a mismatch (before cn->pos)
1528                 * and verify that all the mismatching bits are zero in the
1529                 * new tnode's key.
1530                 */
1531
1532                /*
1533                 * Note: We aren't very concerned about the piece of
1534                 * the key that precede pn->pos+pn->bits, since these
1535                 * have already been checked. The bits after cn->pos
1536                 * aren't checked since these are by definition
1537                 * "unknown" at this point. Thus, what we want to see
1538                 * is if we are about to enter the "prefix matching"
1539                 * state, and in that case verify that the skipped
1540                 * bits that will prevail throughout this subtree are
1541                 * zero, as they have to be if we are to find a
1542                 * matching prefix.
1543                 */
1544
1545                pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1546
1547                /*
1548                 * In short: If skipped bits in this node do not match
1549                 * the search key, enter the "prefix matching"
1550                 * state.directly.
1551                 */
1552                if (pref_mismatch) {
1553                        /* fls(x) = __fls(x) + 1 */
1554                        int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
1555
1556                        if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1557                                goto backtrace;
1558
1559                        if (current_prefix_length >= cn->pos)
1560                                current_prefix_length = mp;
1561                }
1562
1563                pn = (struct tnode *)n; /* Descend */
1564                chopped_off = 0;
1565                continue;
1566
1567backtrace:
1568                chopped_off++;
1569
1570                /* As zero don't change the child key (cindex) */
1571                while ((chopped_off <= pn->bits)
1572                       && !(cindex & (1<<(chopped_off-1))))
1573                        chopped_off++;
1574
1575                /* Decrease current_... with bits chopped off */
1576                if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1577                        current_prefix_length = pn->pos + pn->bits
1578                                - chopped_off;
1579
1580                /*
1581                 * Either we do the actual chop off according or if we have
1582                 * chopped off all bits in this tnode walk up to our parent.
1583                 */
1584
1585                if (chopped_off <= pn->bits) {
1586                        cindex &= ~(1 << (chopped_off-1));
1587                } else {
1588                        struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1589                        if (!parent)
1590                                goto failed;
1591
1592                        /* Get Child's index */
1593                        cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1594                        pn = parent;
1595                        chopped_off = 0;
1596
1597#ifdef CONFIG_IP_FIB_TRIE_STATS
1598                        t->stats.backtrack++;
1599#endif
1600                        goto backtrace;
1601                }
1602        }
1603failed:
1604        ret = 1;
1605found:
1606        rcu_read_unlock();
1607        return ret;
1608}
1609EXPORT_SYMBOL_GPL(fib_table_lookup);
1610
1611/*
1612 * Remove the leaf and return parent.
1613 */
1614static void trie_leaf_remove(struct trie *t, struct leaf *l)
1615{
1616        struct tnode *tp = node_parent((struct rt_trie_node *) l);
1617
1618        pr_debug("entering trie_leaf_remove(%p)\n", l);
1619
1620        if (tp) {
1621                t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1622                put_child(tp, cindex, NULL);
1623                trie_rebalance(t, tp);
1624        } else
1625                RCU_INIT_POINTER(t->trie, NULL);
1626
1627        free_leaf(l);
1628}
1629
1630/*
1631 * Caller must hold RTNL.
1632 */
1633int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1634{
1635        struct trie *t = (struct trie *) tb->tb_data;
1636        u32 key, mask;
1637        int plen = cfg->fc_dst_len;
1638        u8 tos = cfg->fc_tos;
1639        struct fib_alias *fa, *fa_to_delete;
1640        struct list_head *fa_head;
1641        struct leaf *l;
1642        struct leaf_info *li;
1643
1644        if (plen > 32)
1645                return -EINVAL;
1646
1647        key = ntohl(cfg->fc_dst);
1648        mask = ntohl(inet_make_mask(plen));
1649
1650        if (key & ~mask)
1651                return -EINVAL;
1652
1653        key = key & mask;
1654        l = fib_find_node(t, key);
1655
1656        if (!l)
1657                return -ESRCH;
1658
1659        li = find_leaf_info(l, plen);
1660
1661        if (!li)
1662                return -ESRCH;
1663
1664        fa_head = &li->falh;
1665        fa = fib_find_alias(fa_head, tos, 0);
1666
1667        if (!fa)
1668                return -ESRCH;
1669
1670        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1671
1672        fa_to_delete = NULL;
1673        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1674        list_for_each_entry_continue(fa, fa_head, fa_list) {
1675                struct fib_info *fi = fa->fa_info;
1676
1677                if (fa->fa_tos != tos)
1678                        break;
1679
1680                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1681                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1682                     fa->fa_info->fib_scope == cfg->fc_scope) &&
1683                    (!cfg->fc_prefsrc ||
1684                     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1685                    (!cfg->fc_protocol ||
1686                     fi->fib_protocol == cfg->fc_protocol) &&
1687                    fib_nh_match(cfg, fi) == 0) {
1688                        fa_to_delete = fa;
1689                        break;
1690                }
1691        }
1692
1693        if (!fa_to_delete)
1694                return -ESRCH;
1695
1696        fa = fa_to_delete;
1697        rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1698                  &cfg->fc_nlinfo, 0);
1699
1700        list_del_rcu(&fa->fa_list);
1701
1702        if (!plen)
1703                tb->tb_num_default--;
1704
1705        if (list_empty(fa_head)) {
1706                hlist_del_rcu(&li->hlist);
1707                free_leaf_info(li);
1708        }
1709
1710        if (hlist_empty(&l->list))
1711                trie_leaf_remove(t, l);
1712
1713        if (fa->fa_state & FA_S_ACCESSED)
1714                rt_cache_flush(cfg->fc_nlinfo.nl_net);
1715
1716        fib_release_info(fa->fa_info);
1717        alias_free_mem_rcu(fa);
1718        return 0;
1719}
1720
1721static int trie_flush_list(struct list_head *head)
1722{
1723        struct fib_alias *fa, *fa_node;
1724        int found = 0;
1725
1726        list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1727                struct fib_info *fi = fa->fa_info;
1728
1729                if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1730                        list_del_rcu(&fa->fa_list);
1731                        fib_release_info(fa->fa_info);
1732                        alias_free_mem_rcu(fa);
1733                        found++;
1734                }
1735        }
1736        return found;
1737}
1738
1739static int trie_flush_leaf(struct leaf *l)
1740{
1741        int found = 0;
1742        struct hlist_head *lih = &l->list;
1743        struct hlist_node *node, *tmp;
1744        struct leaf_info *li = NULL;
1745
1746        hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1747                found += trie_flush_list(&li->falh);
1748
1749                if (list_empty(&li->falh)) {
1750                        hlist_del_rcu(&li->hlist);
1751                        free_leaf_info(li);
1752                }
1753        }
1754        return found;
1755}
1756
1757/*
1758 * Scan for the next right leaf starting at node p->child[idx]
1759 * Since we have back pointer, no recursion necessary.
1760 */
1761static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1762{
1763        do {
1764                t_key idx;
1765
1766                if (c)
1767                        idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1768                else
1769                        idx = 0;
1770
1771                while (idx < 1u << p->bits) {
1772                        c = tnode_get_child_rcu(p, idx++);
1773                        if (!c)
1774                                continue;
1775
1776                        if (IS_LEAF(c)) {
1777                                prefetch(rcu_dereference_rtnl(p->child[idx]));
1778                                return (struct leaf *) c;
1779                        }
1780
1781                        /* Rescan start scanning in new node */
1782                        p = (struct tnode *) c;
1783                        idx = 0;
1784                }
1785
1786                /* Node empty, walk back up to parent */
1787                c = (struct rt_trie_node *) p;
1788        } while ((p = node_parent_rcu(c)) != NULL);
1789
1790        return NULL; /* Root of trie */
1791}
1792
1793static struct leaf *trie_firstleaf(struct trie *t)
1794{
1795        struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1796
1797        if (!n)
1798                return NULL;
1799
1800        if (IS_LEAF(n))          /* trie is just a leaf */
1801                return (struct leaf *) n;
1802
1803        return leaf_walk_rcu(n, NULL);
1804}
1805
1806static struct leaf *trie_nextleaf(struct leaf *l)
1807{
1808        struct rt_trie_node *c = (struct rt_trie_node *) l;
1809        struct tnode *p = node_parent_rcu(c);
1810
1811        if (!p)
1812                return NULL;    /* trie with just one leaf */
1813
1814        return leaf_walk_rcu(p, c);
1815}
1816
1817static struct leaf *trie_leafindex(struct trie *t, int index)
1818{
1819        struct leaf *l = trie_firstleaf(t);
1820
1821        while (l && index-- > 0)
1822                l = trie_nextleaf(l);
1823
1824        return l;
1825}
1826
1827
1828/*
1829 * Caller must hold RTNL.
1830 */
1831int fib_table_flush(struct fib_table *tb)
1832{
1833        struct trie *t = (struct trie *) tb->tb_data;
1834        struct leaf *l, *ll = NULL;
1835        int found = 0;
1836
1837        for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1838                found += trie_flush_leaf(l);
1839
1840                if (ll && hlist_empty(&ll->list))
1841                        trie_leaf_remove(t, ll);
1842                ll = l;
1843        }
1844
1845        if (ll && hlist_empty(&ll->list))
1846                trie_leaf_remove(t, ll);
1847
1848        pr_debug("trie_flush found=%d\n", found);
1849        return found;
1850}
1851
1852void fib_free_table(struct fib_table *tb)
1853{
1854        kfree(tb);
1855}
1856
1857static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1858                           struct fib_table *tb,
1859                           struct sk_buff *skb, struct netlink_callback *cb)
1860{
1861        int i, s_i;
1862        struct fib_alias *fa;
1863        __be32 xkey = htonl(key);
1864
1865        s_i = cb->args[5];
1866        i = 0;
1867
1868        /* rcu_read_lock is hold by caller */
1869
1870        list_for_each_entry_rcu(fa, fah, fa_list) {
1871                if (i < s_i) {
1872                        i++;
1873                        continue;
1874                }
1875
1876                if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1877                                  cb->nlh->nlmsg_seq,
1878                                  RTM_NEWROUTE,
1879                                  tb->tb_id,
1880                                  fa->fa_type,
1881                                  xkey,
1882                                  plen,
1883                                  fa->fa_tos,
1884                                  fa->fa_info, NLM_F_MULTI) < 0) {
1885                        cb->args[5] = i;
1886                        return -1;
1887                }
1888                i++;
1889        }
1890        cb->args[5] = i;
1891        return skb->len;
1892}
1893
1894static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1895                        struct sk_buff *skb, struct netlink_callback *cb)
1896{
1897        struct leaf_info *li;
1898        struct hlist_node *node;
1899        int i, s_i;
1900
1901        s_i = cb->args[4];
1902        i = 0;
1903
1904        /* rcu_read_lock is hold by caller */
1905        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1906                if (i < s_i) {
1907                        i++;
1908                        continue;
1909                }
1910
1911                if (i > s_i)
1912                        cb->args[5] = 0;
1913
1914                if (list_empty(&li->falh))
1915                        continue;
1916
1917                if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1918                        cb->args[4] = i;
1919                        return -1;
1920                }
1921                i++;
1922        }
1923
1924        cb->args[4] = i;
1925        return skb->len;
1926}
1927
1928int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1929                   struct netlink_callback *cb)
1930{
1931        struct leaf *l;
1932        struct trie *t = (struct trie *) tb->tb_data;
1933        t_key key = cb->args[2];
1934        int count = cb->args[3];
1935
1936        rcu_read_lock();
1937        /* Dump starting at last key.
1938         * Note: 0.0.0.0/0 (ie default) is first key.
1939         */
1940        if (count == 0)
1941                l = trie_firstleaf(t);
1942        else {
1943                /* Normally, continue from last key, but if that is missing
1944                 * fallback to using slow rescan
1945                 */
1946                l = fib_find_node(t, key);
1947                if (!l)
1948                        l = trie_leafindex(t, count);
1949        }
1950
1951        while (l) {
1952                cb->args[2] = l->key;
1953                if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1954                        cb->args[3] = count;
1955                        rcu_read_unlock();
1956                        return -1;
1957                }
1958
1959                ++count;
1960                l = trie_nextleaf(l);
1961                memset(&cb->args[4], 0,
1962                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1963        }
1964        cb->args[3] = count;
1965        rcu_read_unlock();
1966
1967        return skb->len;
1968}
1969
1970void __init fib_trie_init(void)
1971{
1972        fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1973                                          sizeof(struct fib_alias),
1974                                          0, SLAB_PANIC, NULL);
1975
1976        trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1977                                           max(sizeof(struct leaf),
1978                                               sizeof(struct leaf_info)),
1979                                           0, SLAB_PANIC, NULL);
1980}
1981
1982
1983struct fib_table *fib_trie_table(u32 id)
1984{
1985        struct fib_table *tb;
1986        struct trie *t;
1987
1988        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1989                     GFP_KERNEL);
1990        if (tb == NULL)
1991                return NULL;
1992
1993        tb->tb_id = id;
1994        tb->tb_default = -1;
1995        tb->tb_num_default = 0;
1996
1997        t = (struct trie *) tb->tb_data;
1998        memset(t, 0, sizeof(*t));
1999
2000        return tb;
2001}
2002
2003#ifdef CONFIG_PROC_FS
2004/* Depth first Trie walk iterator */
2005struct fib_trie_iter {
2006        struct seq_net_private p;
2007        struct fib_table *tb;
2008        struct tnode *tnode;
2009        unsigned int index;
2010        unsigned int depth;
2011};
2012
2013static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2014{
2015        struct tnode *tn = iter->tnode;
2016        unsigned int cindex = iter->index;
2017        struct tnode *p;
2018
2019        /* A single entry routing table */
2020        if (!tn)
2021                return NULL;
2022
2023        pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2024                 iter->tnode, iter->index, iter->depth);
2025rescan:
2026        while (cindex < (1<<tn->bits)) {
2027                struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2028
2029                if (n) {
2030                        if (IS_LEAF(n)) {
2031                                iter->tnode = tn;
2032                                iter->index = cindex + 1;
2033                        } else {
2034                                /* push down one level */
2035                                iter->tnode = (struct tnode *) n;
2036                                iter->index = 0;
2037                                ++iter->depth;
2038                        }
2039                        return n;
2040                }
2041
2042                ++cindex;
2043        }
2044
2045        /* Current node exhausted, pop back up */
2046        p = node_parent_rcu((struct rt_trie_node *)tn);
2047        if (p) {
2048                cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2049                tn = p;
2050                --iter->depth;
2051                goto rescan;
2052        }
2053
2054        /* got root? */
2055        return NULL;
2056}
2057
2058static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2059                                       struct trie *t)
2060{
2061        struct rt_trie_node *n;
2062
2063        if (!t)
2064                return NULL;
2065
2066        n = rcu_dereference(t->trie);
2067        if (!n)
2068                return NULL;
2069
2070        if (IS_TNODE(n)) {
2071                iter->tnode = (struct tnode *) n;
2072                iter->index = 0;
2073                iter->depth = 1;
2074        } else {
2075                iter->tnode = NULL;
2076                iter->index = 0;
2077                iter->depth = 0;
2078        }
2079
2080        return n;
2081}
2082
2083static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2084{
2085        struct rt_trie_node *n;
2086        struct fib_trie_iter iter;
2087
2088        memset(s, 0, sizeof(*s));
2089
2090        rcu_read_lock();
2091        for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2092                if (IS_LEAF(n)) {
2093                        struct leaf *l = (struct leaf *)n;
2094                        struct leaf_info *li;
2095                        struct hlist_node *tmp;
2096
2097                        s->leaves++;
2098                        s->totdepth += iter.depth;
2099                        if (iter.depth > s->maxdepth)
2100                                s->maxdepth = iter.depth;
2101
2102                        hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2103                                ++s->prefixes;
2104                } else {
2105                        const struct tnode *tn = (const struct tnode *) n;
2106                        int i;
2107
2108                        s->tnodes++;
2109                        if (tn->bits < MAX_STAT_DEPTH)
2110                                s->nodesizes[tn->bits]++;
2111
2112                        for (i = 0; i < (1<<tn->bits); i++)
2113                                if (!tn->child[i])
2114                                        s->nullpointers++;
2115                }
2116        }
2117        rcu_read_unlock();
2118}
2119
2120/*
2121 *      This outputs /proc/net/fib_triestats
2122 */
2123static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2124{
2125        unsigned int i, max, pointers, bytes, avdepth;
2126
2127        if (stat->leaves)
2128                avdepth = stat->totdepth*100 / stat->leaves;
2129        else
2130                avdepth = 0;
2131
2132        seq_printf(seq, "\tAver depth:     %u.%02d\n",
2133                   avdepth / 100, avdepth % 100);
2134        seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2135
2136        seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2137        bytes = sizeof(struct leaf) * stat->leaves;
2138
2139        seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2140        bytes += sizeof(struct leaf_info) * stat->prefixes;
2141
2142        seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2143        bytes += sizeof(struct tnode) * stat->tnodes;
2144
2145        max = MAX_STAT_DEPTH;
2146        while (max > 0 && stat->nodesizes[max-1] == 0)
2147                max--;
2148
2149        pointers = 0;
2150        for (i = 1; i <= max; i++)
2151                if (stat->nodesizes[i] != 0) {
2152                        seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2153                        pointers += (1<<i) * stat->nodesizes[i];
2154                }
2155        seq_putc(seq, '\n');
2156        seq_printf(seq, "\tPointers: %u\n", pointers);
2157
2158        bytes += sizeof(struct rt_trie_node *) * pointers;
2159        seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2160        seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2161}
2162
2163#ifdef CONFIG_IP_FIB_TRIE_STATS
2164static void trie_show_usage(struct seq_file *seq,
2165                            const struct trie_use_stats *stats)
2166{
2167        seq_printf(seq, "\nCounters:\n---------\n");
2168        seq_printf(seq, "gets = %u\n", stats->gets);
2169        seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2170        seq_printf(seq, "semantic match passed = %u\n",
2171                   stats->semantic_match_passed);
2172        seq_printf(seq, "semantic match miss = %u\n",
2173                   stats->semantic_match_miss);
2174        seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2175        seq_printf(seq, "skipped node resize = %u\n\n",
2176                   stats->resize_node_skipped);
2177}
2178#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2179
2180static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2181{
2182        if (tb->tb_id == RT_TABLE_LOCAL)
2183                seq_puts(seq, "Local:\n");
2184        else if (tb->tb_id == RT_TABLE_MAIN)
2185                seq_puts(seq, "Main:\n");
2186        else
2187                seq_printf(seq, "Id %d:\n", tb->tb_id);
2188}
2189
2190
2191static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2192{
2193        struct net *net = (struct net *)seq->private;
2194        unsigned int h;
2195
2196        seq_printf(seq,
2197                   "Basic info: size of leaf:"
2198                   " %Zd bytes, size of tnode: %Zd bytes.\n",
2199                   sizeof(struct leaf), sizeof(struct tnode));
2200
2201        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2202                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2203                struct hlist_node *node;
2204                struct fib_table *tb;
2205
2206                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2207                        struct trie *t = (struct trie *) tb->tb_data;
2208                        struct trie_stat stat;
2209
2210                        if (!t)
2211                                continue;
2212
2213                        fib_table_print(seq, tb);
2214
2215                        trie_collect_stats(t, &stat);
2216                        trie_show_stats(seq, &stat);
2217#ifdef CONFIG_IP_FIB_TRIE_STATS
2218                        trie_show_usage(seq, &t->stats);
2219#endif
2220                }
2221        }
2222
2223        return 0;
2224}
2225
2226static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2227{
2228        return single_open_net(inode, file, fib_triestat_seq_show);
2229}
2230
2231static const struct file_operations fib_triestat_fops = {
2232        .owner  = THIS_MODULE,
2233        .open   = fib_triestat_seq_open,
2234        .read   = seq_read,
2235        .llseek = seq_lseek,
2236        .release = single_release_net,
2237};
2238
2239static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2240{
2241        struct fib_trie_iter *iter = seq->private;
2242        struct net *net = seq_file_net(seq);
2243        loff_t idx = 0;
2244        unsigned int h;
2245
2246        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2247                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2248                struct hlist_node *node;
2249                struct fib_table *tb;
2250
2251                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2252                        struct rt_trie_node *n;
2253
2254                        for (n = fib_trie_get_first(iter,
2255                                                    (struct trie *) tb->tb_data);
2256                             n; n = fib_trie_get_next(iter))
2257                                if (pos == idx++) {
2258                                        iter->tb = tb;
2259                                        return n;
2260                                }
2261                }
2262        }
2263
2264        return NULL;
2265}
2266
2267static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2268        __acquires(RCU)
2269{
2270        rcu_read_lock();
2271        return fib_trie_get_idx(seq, *pos);
2272}
2273
2274static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2275{
2276        struct fib_trie_iter *iter = seq->private;
2277        struct net *net = seq_file_net(seq);
2278        struct fib_table *tb = iter->tb;
2279        struct hlist_node *tb_node;
2280        unsigned int h;
2281        struct rt_trie_node *n;
2282
2283        ++*pos;
2284        /* next node in same table */
2285        n = fib_trie_get_next(iter);
2286        if (n)
2287                return n;
2288
2289        /* walk rest of this hash chain */
2290        h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2291        while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2292                tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2293                n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2294                if (n)
2295                        goto found;
2296        }
2297
2298        /* new hash chain */
2299        while (++h < FIB_TABLE_HASHSZ) {
2300                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2301                hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2302                        n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2303                        if (n)
2304                                goto found;
2305                }
2306        }
2307        return NULL;
2308
2309found:
2310        iter->tb = tb;
2311        return n;
2312}
2313
2314static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2315        __releases(RCU)
2316{
2317        rcu_read_unlock();
2318}
2319
2320static void seq_indent(struct seq_file *seq, int n)
2321{
2322        while (n-- > 0)
2323                seq_puts(seq, "   ");
2324}
2325
2326static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2327{
2328        switch (s) {
2329        case RT_SCOPE_UNIVERSE: return "universe";
2330        case RT_SCOPE_SITE:     return "site";
2331        case RT_SCOPE_LINK:     return "link";
2332        case RT_SCOPE_HOST:     return "host";
2333        case RT_SCOPE_NOWHERE:  return "nowhere";
2334        default:
2335                snprintf(buf, len, "scope=%d", s);
2336                return buf;
2337        }
2338}
2339
2340static const char *const rtn_type_names[__RTN_MAX] = {
2341        [RTN_UNSPEC] = "UNSPEC",
2342        [RTN_UNICAST] = "UNICAST",
2343        [RTN_LOCAL] = "LOCAL",
2344        [RTN_BROADCAST] = "BROADCAST",
2345        [RTN_ANYCAST] = "ANYCAST",
2346        [RTN_MULTICAST] = "MULTICAST",
2347        [RTN_BLACKHOLE] = "BLACKHOLE",
2348        [RTN_UNREACHABLE] = "UNREACHABLE",
2349        [RTN_PROHIBIT] = "PROHIBIT",
2350        [RTN_THROW] = "THROW",
2351        [RTN_NAT] = "NAT",
2352        [RTN_XRESOLVE] = "XRESOLVE",
2353};
2354
2355static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2356{
2357        if (t < __RTN_MAX && rtn_type_names[t])
2358                return rtn_type_names[t];
2359        snprintf(buf, len, "type %u", t);
2360        return buf;
2361}
2362
2363/* Pretty print the trie */
2364static int fib_trie_seq_show(struct seq_file *seq, void *v)
2365{
2366        const struct fib_trie_iter *iter = seq->private;
2367        struct rt_trie_node *n = v;
2368
2369        if (!node_parent_rcu(n))
2370                fib_table_print(seq, iter->tb);
2371
2372        if (IS_TNODE(n)) {
2373                struct tnode *tn = (struct tnode *) n;
2374                __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2375
2376                seq_indent(seq, iter->depth-1);
2377                seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2378                           &prf, tn->pos, tn->bits, tn->full_children,
2379                           tn->empty_children);
2380
2381        } else {
2382                struct leaf *l = (struct leaf *) n;
2383                struct leaf_info *li;
2384                struct hlist_node *node;
2385                __be32 val = htonl(l->key);
2386
2387                seq_indent(seq, iter->depth);
2388                seq_printf(seq, "  |-- %pI4\n", &val);
2389
2390                hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2391                        struct fib_alias *fa;
2392
2393                        list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2394                                char buf1[32], buf2[32];
2395
2396                                seq_indent(seq, iter->depth+1);
2397                                seq_printf(seq, "  /%d %s %s", li->plen,
2398                                           rtn_scope(buf1, sizeof(buf1),
2399                                                     fa->fa_info->fib_scope),
2400                                           rtn_type(buf2, sizeof(buf2),
2401                                                    fa->fa_type));
2402                                if (fa->fa_tos)
2403                                        seq_printf(seq, " tos=%d", fa->fa_tos);
2404                                seq_putc(seq, '\n');
2405                        }
2406                }
2407        }
2408
2409        return 0;
2410}
2411
2412static const struct seq_operations fib_trie_seq_ops = {
2413        .start  = fib_trie_seq_start,
2414        .next   = fib_trie_seq_next,
2415        .stop   = fib_trie_seq_stop,
2416        .show   = fib_trie_seq_show,
2417};
2418
2419static int fib_trie_seq_open(struct inode *inode, struct file *file)
2420{
2421        return seq_open_net(inode, file, &fib_trie_seq_ops,
2422                            sizeof(struct fib_trie_iter));
2423}
2424
2425static const struct file_operations fib_trie_fops = {
2426        .owner  = THIS_MODULE,
2427        .open   = fib_trie_seq_open,
2428        .read   = seq_read,
2429        .llseek = seq_lseek,
2430        .release = seq_release_net,
2431};
2432
2433struct fib_route_iter {
2434        struct seq_net_private p;
2435        struct trie *main_trie;
2436        loff_t  pos;
2437        t_key   key;
2438};
2439
2440static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2441{
2442        struct leaf *l = NULL;
2443        struct trie *t = iter->main_trie;
2444
2445        /* use cache location of last found key */
2446        if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2447                pos -= iter->pos;
2448        else {
2449                iter->pos = 0;
2450                l = trie_firstleaf(t);
2451        }
2452
2453        while (l && pos-- > 0) {
2454                iter->pos++;
2455                l = trie_nextleaf(l);
2456        }
2457
2458        if (l)
2459                iter->key = pos;        /* remember it */
2460        else
2461                iter->pos = 0;          /* forget it */
2462
2463        return l;
2464}
2465
2466static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2467        __acquires(RCU)
2468{
2469        struct fib_route_iter *iter = seq->private;
2470        struct fib_table *tb;
2471
2472        rcu_read_lock();
2473        tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2474        if (!tb)
2475                return NULL;
2476
2477        iter->main_trie = (struct trie *) tb->tb_data;
2478        if (*pos == 0)
2479                return SEQ_START_TOKEN;
2480        else
2481                return fib_route_get_idx(iter, *pos - 1);
2482}
2483
2484static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2485{
2486        struct fib_route_iter *iter = seq->private;
2487        struct leaf *l = v;
2488
2489        ++*pos;
2490        if (v == SEQ_START_TOKEN) {
2491                iter->pos = 0;
2492                l = trie_firstleaf(iter->main_trie);
2493        } else {
2494                iter->pos++;
2495                l = trie_nextleaf(l);
2496        }
2497
2498        if (l)
2499                iter->key = l->key;
2500        else
2501                iter->pos = 0;
2502        return l;
2503}
2504
2505static void fib_route_seq_stop(struct seq_file *seq, void *v)
2506        __releases(RCU)
2507{
2508        rcu_read_unlock();
2509}
2510
2511static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2512{
2513        unsigned int flags = 0;
2514
2515        if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2516                flags = RTF_REJECT;
2517        if (fi && fi->fib_nh->nh_gw)
2518                flags |= RTF_GATEWAY;
2519        if (mask == htonl(0xFFFFFFFF))
2520                flags |= RTF_HOST;
2521        flags |= RTF_UP;
2522        return flags;
2523}
2524
2525/*
2526 *      This outputs /proc/net/route.
2527 *      The format of the file is not supposed to be changed
2528 *      and needs to be same as fib_hash output to avoid breaking
2529 *      legacy utilities
2530 */
2531static int fib_route_seq_show(struct seq_file *seq, void *v)
2532{
2533        struct leaf *l = v;
2534        struct leaf_info *li;
2535        struct hlist_node *node;
2536
2537        if (v == SEQ_START_TOKEN) {
2538                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2539                           "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2540                           "\tWindow\tIRTT");
2541                return 0;
2542        }
2543
2544        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2545                struct fib_alias *fa;
2546                __be32 mask, prefix;
2547
2548                mask = inet_make_mask(li->plen);
2549                prefix = htonl(l->key);
2550
2551                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2552                        const struct fib_info *fi = fa->fa_info;
2553                        unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2554                        int len;
2555
2556                        if (fa->fa_type == RTN_BROADCAST
2557                            || fa->fa_type == RTN_MULTICAST)
2558                                continue;
2559
2560                        if (fi)
2561                                seq_printf(seq,
2562                                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2563                                         "%d\t%08X\t%d\t%u\t%u%n",
2564                                         fi->fib_dev ? fi->fib_dev->name : "*",
2565                                         prefix,
2566                                         fi->fib_nh->nh_gw, flags, 0, 0,
2567                                         fi->fib_priority,
2568                                         mask,
2569                                         (fi->fib_advmss ?
2570                                          fi->fib_advmss + 40 : 0),
2571                                         fi->fib_window,
2572                                         fi->fib_rtt >> 3, &len);
2573                        else
2574                                seq_printf(seq,
2575                                         "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2576                                         "%d\t%08X\t%d\t%u\t%u%n",
2577                                         prefix, 0, flags, 0, 0, 0,
2578                                         mask, 0, 0, 0, &len);
2579
2580                        seq_printf(seq, "%*s\n", 127 - len, "");
2581                }
2582        }
2583
2584        return 0;
2585}
2586
2587static const struct seq_operations fib_route_seq_ops = {
2588        .start  = fib_route_seq_start,
2589        .next   = fib_route_seq_next,
2590        .stop   = fib_route_seq_stop,
2591        .show   = fib_route_seq_show,
2592};
2593
2594static int fib_route_seq_open(struct inode *inode, struct file *file)
2595{
2596        return seq_open_net(inode, file, &fib_route_seq_ops,
2597                            sizeof(struct fib_route_iter));
2598}
2599
2600static const struct file_operations fib_route_fops = {
2601        .owner  = THIS_MODULE,
2602        .open   = fib_route_seq_open,
2603        .read   = seq_read,
2604        .llseek = seq_lseek,
2605        .release = seq_release_net,
2606};
2607
2608int __net_init fib_proc_init(struct net *net)
2609{
2610        if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2611                goto out1;
2612
2613        if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2614                                  &fib_triestat_fops))
2615                goto out2;
2616
2617        if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2618                goto out3;
2619
2620        return 0;
2621
2622out3:
2623        proc_net_remove(net, "fib_triestat");
2624out2:
2625        proc_net_remove(net, "fib_trie");
2626out1:
2627        return -ENOMEM;
2628}
2629
2630void __net_exit fib_proc_exit(struct net *net)
2631{
2632        proc_net_remove(net, "fib_trie");
2633        proc_net_remove(net, "fib_triestat");
2634        proc_net_remove(net, "route");
2635}
2636
2637#endif /* CONFIG_PROC_FS */
2638
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.