linux/net/ipv4/fib_trie.c
<<
>>
Prefs
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally described in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET         An implementation of the TCP/IP protocol suite for the LINUX
  30 *              operating system.  INET is implemented using the  BSD Socket
  31 *              interface as the means of communication with the user level.
  32 *
  33 *              IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *              This program is free software; you can redistribute it and/or
  39 *              modify it under the terms of the GNU General Public License
  40 *              as published by the Free Software Foundation; either version
  41 *              2 of the License, or (at your option) any later version.
  42 *
  43 * Substantial contributions to this work comes from:
  44 *
  45 *              David S. Miller, <davem@davemloft.net>
  46 *              Stephen Hemminger <shemminger@osdl.org>
  47 *              Paul E. McKenney <paulmck@us.ibm.com>
  48 *              Patrick McHardy <kaber@trash.net>
  49 */
  50
  51#define VERSION "0.409"
  52
  53#include <asm/uaccess.h>
  54#include <linux/bitops.h>
  55#include <linux/types.h>
  56#include <linux/kernel.h>
  57#include <linux/mm.h>
  58#include <linux/string.h>
  59#include <linux/socket.h>
  60#include <linux/sockios.h>
  61#include <linux/errno.h>
  62#include <linux/in.h>
  63#include <linux/inet.h>
  64#include <linux/inetdevice.h>
  65#include <linux/netdevice.h>
  66#include <linux/if_arp.h>
  67#include <linux/proc_fs.h>
  68#include <linux/rcupdate.h>
  69#include <linux/skbuff.h>
  70#include <linux/netlink.h>
  71#include <linux/init.h>
  72#include <linux/list.h>
  73#include <linux/slab.h>
  74#include <linux/prefetch.h>
  75#include <linux/export.h>
  76#include <net/net_namespace.h>
  77#include <net/ip.h>
  78#include <net/protocol.h>
  79#include <net/route.h>
  80#include <net/tcp.h>
  81#include <net/sock.h>
  82#include <net/ip_fib.h>
  83#include "fib_lookup.h"
  84
  85#define MAX_STAT_DEPTH 32
  86
  87#define KEYLENGTH (8*sizeof(t_key))
  88
  89typedef unsigned int t_key;
  90
  91#define T_TNODE 0
  92#define T_LEAF  1
  93#define NODE_TYPE_MASK  0x1UL
  94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  95
  96#define IS_TNODE(n) (!(n->parent & T_LEAF))
  97#define IS_LEAF(n) (n->parent & T_LEAF)
  98
  99struct rt_trie_node {
 100        unsigned long parent;
 101        t_key key;
 102};
 103
 104struct leaf {
 105        unsigned long parent;
 106        t_key key;
 107        struct hlist_head list;
 108        struct rcu_head rcu;
 109};
 110
 111struct leaf_info {
 112        struct hlist_node hlist;
 113        int plen;
 114        u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
 115        struct list_head falh;
 116        struct rcu_head rcu;
 117};
 118
 119struct tnode {
 120        unsigned long parent;
 121        t_key key;
 122        unsigned char pos;              /* 2log(KEYLENGTH) bits needed */
 123        unsigned char bits;             /* 2log(KEYLENGTH) bits needed */
 124        unsigned int full_children;     /* KEYLENGTH bits needed */
 125        unsigned int empty_children;    /* KEYLENGTH bits needed */
 126        union {
 127                struct rcu_head rcu;
 128                struct work_struct work;
 129                struct tnode *tnode_free;
 130        };
 131        struct rt_trie_node __rcu *child[0];
 132};
 133
 134#ifdef CONFIG_IP_FIB_TRIE_STATS
 135struct trie_use_stats {
 136        unsigned int gets;
 137        unsigned int backtrack;
 138        unsigned int semantic_match_passed;
 139        unsigned int semantic_match_miss;
 140        unsigned int null_node_hit;
 141        unsigned int resize_node_skipped;
 142};
 143#endif
 144
 145struct trie_stat {
 146        unsigned int totdepth;
 147        unsigned int maxdepth;
 148        unsigned int tnodes;
 149        unsigned int leaves;
 150        unsigned int nullpointers;
 151        unsigned int prefixes;
 152        unsigned int nodesizes[MAX_STAT_DEPTH];
 153};
 154
 155struct trie {
 156        struct rt_trie_node __rcu *trie;
 157#ifdef CONFIG_IP_FIB_TRIE_STATS
 158        struct trie_use_stats stats;
 159#endif
 160};
 161
 162static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 163                                  int wasfull);
 164static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
 165static struct tnode *inflate(struct trie *t, struct tnode *tn);
 166static struct tnode *halve(struct trie *t, struct tnode *tn);
 167/* tnodes to free after resize(); protected by RTNL */
 168static struct tnode *tnode_free_head;
 169static size_t tnode_free_size;
 170
 171/*
 172 * synchronize_rcu after call_rcu for that many pages; it should be especially
 173 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 174 * obtained experimentally, aiming to avoid visible slowdown.
 175 */
 176static const int sync_pages = 128;
 177
 178static struct kmem_cache *fn_alias_kmem __read_mostly;
 179static struct kmem_cache *trie_leaf_kmem __read_mostly;
 180
 181/*
 182 * caller must hold RTNL
 183 */
 184static inline struct tnode *node_parent(const struct rt_trie_node *node)
 185{
 186        unsigned long parent;
 187
 188        parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
 189
 190        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 191}
 192
 193/*
 194 * caller must hold RCU read lock or RTNL
 195 */
 196static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
 197{
 198        unsigned long parent;
 199
 200        parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
 201                                                           lockdep_rtnl_is_held());
 202
 203        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 204}
 205
 206/* Same as rcu_assign_pointer
 207 * but that macro() assumes that value is a pointer.
 208 */
 209static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
 210{
 211        smp_wmb();
 212        node->parent = (unsigned long)ptr | NODE_TYPE(node);
 213}
 214
 215/*
 216 * caller must hold RTNL
 217 */
 218static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
 219{
 220        BUG_ON(i >= 1U << tn->bits);
 221
 222        return rtnl_dereference(tn->child[i]);
 223}
 224
 225/*
 226 * caller must hold RCU read lock or RTNL
 227 */
 228static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
 229{
 230        BUG_ON(i >= 1U << tn->bits);
 231
 232        return rcu_dereference_rtnl(tn->child[i]);
 233}
 234
 235static inline int tnode_child_length(const struct tnode *tn)
 236{
 237        return 1 << tn->bits;
 238}
 239
 240static inline t_key mask_pfx(t_key k, unsigned int l)
 241{
 242        return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 243}
 244
 245static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
 246{
 247        if (offset < KEYLENGTH)
 248                return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 249        else
 250                return 0;
 251}
 252
 253static inline int tkey_equals(t_key a, t_key b)
 254{
 255        return a == b;
 256}
 257
 258static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 259{
 260        if (bits == 0 || offset >= KEYLENGTH)
 261                return 1;
 262        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 263        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 264}
 265
 266static inline int tkey_mismatch(t_key a, int offset, t_key b)
 267{
 268        t_key diff = a ^ b;
 269        int i = offset;
 270
 271        if (!diff)
 272                return 0;
 273        while ((diff << i) >> (KEYLENGTH-1) == 0)
 274                i++;
 275        return i;
 276}
 277
 278/*
 279  To understand this stuff, an understanding of keys and all their bits is
 280  necessary. Every node in the trie has a key associated with it, but not
 281  all of the bits in that key are significant.
 282
 283  Consider a node 'n' and its parent 'tp'.
 284
 285  If n is a leaf, every bit in its key is significant. Its presence is
 286  necessitated by path compression, since during a tree traversal (when
 287  searching for a leaf - unless we are doing an insertion) we will completely
 288  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 289  a potentially successful search, that we have indeed been walking the
 290  correct key path.
 291
 292  Note that we can never "miss" the correct key in the tree if present by
 293  following the wrong path. Path compression ensures that segments of the key
 294  that are the same for all keys with a given prefix are skipped, but the
 295  skipped part *is* identical for each node in the subtrie below the skipped
 296  bit! trie_insert() in this implementation takes care of that - note the
 297  call to tkey_sub_equals() in trie_insert().
 298
 299  if n is an internal node - a 'tnode' here, the various parts of its key
 300  have many different meanings.
 301
 302  Example:
 303  _________________________________________________________________
 304  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 305  -----------------------------------------------------------------
 306    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 307
 308  _________________________________________________________________
 309  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 310  -----------------------------------------------------------------
 311   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 312
 313  tp->pos = 7
 314  tp->bits = 3
 315  n->pos = 15
 316  n->bits = 4
 317
 318  First, let's just ignore the bits that come before the parent tp, that is
 319  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 320  not use them for anything.
 321
 322  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 323  index into the parent's child array. That is, they will be used to find
 324  'n' among tp's children.
 325
 326  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 327  for the node n.
 328
 329  All the bits we have seen so far are significant to the node n. The rest
 330  of the bits are really not needed or indeed known in n->key.
 331
 332  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 333  n's child array, and will of course be different for each child.
 334
 335
 336  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 337  at this point.
 338
 339*/
 340
 341static inline void check_tnode(const struct tnode *tn)
 342{
 343        WARN_ON(tn && tn->pos+tn->bits > 32);
 344}
 345
 346static const int halve_threshold = 25;
 347static const int inflate_threshold = 50;
 348static const int halve_threshold_root = 15;
 349static const int inflate_threshold_root = 30;
 350
 351static void __alias_free_mem(struct rcu_head *head)
 352{
 353        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 354        kmem_cache_free(fn_alias_kmem, fa);
 355}
 356
 357static inline void alias_free_mem_rcu(struct fib_alias *fa)
 358{
 359        call_rcu(&fa->rcu, __alias_free_mem);
 360}
 361
 362static void __leaf_free_rcu(struct rcu_head *head)
 363{
 364        struct leaf *l = container_of(head, struct leaf, rcu);
 365        kmem_cache_free(trie_leaf_kmem, l);
 366}
 367
 368static inline void free_leaf(struct leaf *l)
 369{
 370        call_rcu(&l->rcu, __leaf_free_rcu);
 371}
 372
 373static inline void free_leaf_info(struct leaf_info *leaf)
 374{
 375        kfree_rcu(leaf, rcu);
 376}
 377
 378static struct tnode *tnode_alloc(size_t size)
 379{
 380        if (size <= PAGE_SIZE)
 381                return kzalloc(size, GFP_KERNEL);
 382        else
 383                return vzalloc(size);
 384}
 385
 386static void __tnode_vfree(struct work_struct *arg)
 387{
 388        struct tnode *tn = container_of(arg, struct tnode, work);
 389        vfree(tn);
 390}
 391
 392static void __tnode_free_rcu(struct rcu_head *head)
 393{
 394        struct tnode *tn = container_of(head, struct tnode, rcu);
 395        size_t size = sizeof(struct tnode) +
 396                      (sizeof(struct rt_trie_node *) << tn->bits);
 397
 398        if (size <= PAGE_SIZE)
 399                kfree(tn);
 400        else {
 401                INIT_WORK(&tn->work, __tnode_vfree);
 402                schedule_work(&tn->work);
 403        }
 404}
 405
 406static inline void tnode_free(struct tnode *tn)
 407{
 408        if (IS_LEAF(tn))
 409                free_leaf((struct leaf *) tn);
 410        else
 411                call_rcu(&tn->rcu, __tnode_free_rcu);
 412}
 413
 414static void tnode_free_safe(struct tnode *tn)
 415{
 416        BUG_ON(IS_LEAF(tn));
 417        tn->tnode_free = tnode_free_head;
 418        tnode_free_head = tn;
 419        tnode_free_size += sizeof(struct tnode) +
 420                           (sizeof(struct rt_trie_node *) << tn->bits);
 421}
 422
 423static void tnode_free_flush(void)
 424{
 425        struct tnode *tn;
 426
 427        while ((tn = tnode_free_head)) {
 428                tnode_free_head = tn->tnode_free;
 429                tn->tnode_free = NULL;
 430                tnode_free(tn);
 431        }
 432
 433        if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 434                tnode_free_size = 0;
 435                synchronize_rcu();
 436        }
 437}
 438
 439static struct leaf *leaf_new(void)
 440{
 441        struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 442        if (l) {
 443                l->parent = T_LEAF;
 444                INIT_HLIST_HEAD(&l->list);
 445        }
 446        return l;
 447}
 448
 449static struct leaf_info *leaf_info_new(int plen)
 450{
 451        struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 452        if (li) {
 453                li->plen = plen;
 454                li->mask_plen = ntohl(inet_make_mask(plen));
 455                INIT_LIST_HEAD(&li->falh);
 456        }
 457        return li;
 458}
 459
 460static struct tnode *tnode_new(t_key key, int pos, int bits)
 461{
 462        size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
 463        struct tnode *tn = tnode_alloc(sz);
 464
 465        if (tn) {
 466                tn->parent = T_TNODE;
 467                tn->pos = pos;
 468                tn->bits = bits;
 469                tn->key = key;
 470                tn->full_children = 0;
 471                tn->empty_children = 1<<bits;
 472        }
 473
 474        pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
 475                 sizeof(struct rt_trie_node *) << bits);
 476        return tn;
 477}
 478
 479/*
 480 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 481 * and no bits are skipped. See discussion in dyntree paper p. 6
 482 */
 483
 484static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
 485{
 486        if (n == NULL || IS_LEAF(n))
 487                return 0;
 488
 489        return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 490}
 491
 492static inline void put_child(struct tnode *tn, int i,
 493                             struct rt_trie_node *n)
 494{
 495        tnode_put_child_reorg(tn, i, n, -1);
 496}
 497
 498 /*
 499  * Add a child at position i overwriting the old value.
 500  * Update the value of full_children and empty_children.
 501  */
 502
 503static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 504                                  int wasfull)
 505{
 506        struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
 507        int isfull;
 508
 509        BUG_ON(i >= 1<<tn->bits);
 510
 511        /* update emptyChildren */
 512        if (n == NULL && chi != NULL)
 513                tn->empty_children++;
 514        else if (n != NULL && chi == NULL)
 515                tn->empty_children--;
 516
 517        /* update fullChildren */
 518        if (wasfull == -1)
 519                wasfull = tnode_full(tn, chi);
 520
 521        isfull = tnode_full(tn, n);
 522        if (wasfull && !isfull)
 523                tn->full_children--;
 524        else if (!wasfull && isfull)
 525                tn->full_children++;
 526
 527        if (n)
 528                node_set_parent(n, tn);
 529
 530        rcu_assign_pointer(tn->child[i], n);
 531}
 532
 533#define MAX_WORK 10
 534static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 535{
 536        int i;
 537        struct tnode *old_tn;
 538        int inflate_threshold_use;
 539        int halve_threshold_use;
 540        int max_work;
 541
 542        if (!tn)
 543                return NULL;
 544
 545        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 546                 tn, inflate_threshold, halve_threshold);
 547
 548        /* No children */
 549        if (tn->empty_children == tnode_child_length(tn)) {
 550                tnode_free_safe(tn);
 551                return NULL;
 552        }
 553        /* One child */
 554        if (tn->empty_children == tnode_child_length(tn) - 1)
 555                goto one_child;
 556        /*
 557         * Double as long as the resulting node has a number of
 558         * nonempty nodes that are above the threshold.
 559         */
 560
 561        /*
 562         * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 563         * the Helsinki University of Technology and Matti Tikkanen of Nokia
 564         * Telecommunications, page 6:
 565         * "A node is doubled if the ratio of non-empty children to all
 566         * children in the *doubled* node is at least 'high'."
 567         *
 568         * 'high' in this instance is the variable 'inflate_threshold'. It
 569         * is expressed as a percentage, so we multiply it with
 570         * tnode_child_length() and instead of multiplying by 2 (since the
 571         * child array will be doubled by inflate()) and multiplying
 572         * the left-hand side by 100 (to handle the percentage thing) we
 573         * multiply the left-hand side by 50.
 574         *
 575         * The left-hand side may look a bit weird: tnode_child_length(tn)
 576         * - tn->empty_children is of course the number of non-null children
 577         * in the current node. tn->full_children is the number of "full"
 578         * children, that is non-null tnodes with a skip value of 0.
 579         * All of those will be doubled in the resulting inflated tnode, so
 580         * we just count them one extra time here.
 581         *
 582         * A clearer way to write this would be:
 583         *
 584         * to_be_doubled = tn->full_children;
 585         * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 586         *     tn->full_children;
 587         *
 588         * new_child_length = tnode_child_length(tn) * 2;
 589         *
 590         * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 591         *      new_child_length;
 592         * if (new_fill_factor >= inflate_threshold)
 593         *
 594         * ...and so on, tho it would mess up the while () loop.
 595         *
 596         * anyway,
 597         * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 598         *      inflate_threshold
 599         *
 600         * avoid a division:
 601         * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 602         *      inflate_threshold * new_child_length
 603         *
 604         * expand not_to_be_doubled and to_be_doubled, and shorten:
 605         * 100 * (tnode_child_length(tn) - tn->empty_children +
 606         *    tn->full_children) >= inflate_threshold * new_child_length
 607         *
 608         * expand new_child_length:
 609         * 100 * (tnode_child_length(tn) - tn->empty_children +
 610         *    tn->full_children) >=
 611         *      inflate_threshold * tnode_child_length(tn) * 2
 612         *
 613         * shorten again:
 614         * 50 * (tn->full_children + tnode_child_length(tn) -
 615         *    tn->empty_children) >= inflate_threshold *
 616         *    tnode_child_length(tn)
 617         *
 618         */
 619
 620        check_tnode(tn);
 621
 622        /* Keep root node larger  */
 623
 624        if (!node_parent((struct rt_trie_node *)tn)) {
 625                inflate_threshold_use = inflate_threshold_root;
 626                halve_threshold_use = halve_threshold_root;
 627        } else {
 628                inflate_threshold_use = inflate_threshold;
 629                halve_threshold_use = halve_threshold;
 630        }
 631
 632        max_work = MAX_WORK;
 633        while ((tn->full_children > 0 &&  max_work-- &&
 634                50 * (tn->full_children + tnode_child_length(tn)
 635                      - tn->empty_children)
 636                >= inflate_threshold_use * tnode_child_length(tn))) {
 637
 638                old_tn = tn;
 639                tn = inflate(t, tn);
 640
 641                if (IS_ERR(tn)) {
 642                        tn = old_tn;
 643#ifdef CONFIG_IP_FIB_TRIE_STATS
 644                        t->stats.resize_node_skipped++;
 645#endif
 646                        break;
 647                }
 648        }
 649
 650        check_tnode(tn);
 651
 652        /* Return if at least one inflate is run */
 653        if (max_work != MAX_WORK)
 654                return (struct rt_trie_node *) tn;
 655
 656        /*
 657         * Halve as long as the number of empty children in this
 658         * node is above threshold.
 659         */
 660
 661        max_work = MAX_WORK;
 662        while (tn->bits > 1 &&  max_work-- &&
 663               100 * (tnode_child_length(tn) - tn->empty_children) <
 664               halve_threshold_use * tnode_child_length(tn)) {
 665
 666                old_tn = tn;
 667                tn = halve(t, tn);
 668                if (IS_ERR(tn)) {
 669                        tn = old_tn;
 670#ifdef CONFIG_IP_FIB_TRIE_STATS
 671                        t->stats.resize_node_skipped++;
 672#endif
 673                        break;
 674                }
 675        }
 676
 677
 678        /* Only one child remains */
 679        if (tn->empty_children == tnode_child_length(tn) - 1) {
 680one_child:
 681                for (i = 0; i < tnode_child_length(tn); i++) {
 682                        struct rt_trie_node *n;
 683
 684                        n = rtnl_dereference(tn->child[i]);
 685                        if (!n)
 686                                continue;
 687
 688                        /* compress one level */
 689
 690                        node_set_parent(n, NULL);
 691                        tnode_free_safe(tn);
 692                        return n;
 693                }
 694        }
 695        return (struct rt_trie_node *) tn;
 696}
 697
 698
 699static void tnode_clean_free(struct tnode *tn)
 700{
 701        int i;
 702        struct tnode *tofree;
 703
 704        for (i = 0; i < tnode_child_length(tn); i++) {
 705                tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
 706                if (tofree)
 707                        tnode_free(tofree);
 708        }
 709        tnode_free(tn);
 710}
 711
 712static struct tnode *inflate(struct trie *t, struct tnode *tn)
 713{
 714        struct tnode *oldtnode = tn;
 715        int olen = tnode_child_length(tn);
 716        int i;
 717
 718        pr_debug("In inflate\n");
 719
 720        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 721
 722        if (!tn)
 723                return ERR_PTR(-ENOMEM);
 724
 725        /*
 726         * Preallocate and store tnodes before the actual work so we
 727         * don't get into an inconsistent state if memory allocation
 728         * fails. In case of failure we return the oldnode and  inflate
 729         * of tnode is ignored.
 730         */
 731
 732        for (i = 0; i < olen; i++) {
 733                struct tnode *inode;
 734
 735                inode = (struct tnode *) tnode_get_child(oldtnode, i);
 736                if (inode &&
 737                    IS_TNODE(inode) &&
 738                    inode->pos == oldtnode->pos + oldtnode->bits &&
 739                    inode->bits > 1) {
 740                        struct tnode *left, *right;
 741                        t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 742
 743                        left = tnode_new(inode->key&(~m), inode->pos + 1,
 744                                         inode->bits - 1);
 745                        if (!left)
 746                                goto nomem;
 747
 748                        right = tnode_new(inode->key|m, inode->pos + 1,
 749                                          inode->bits - 1);
 750
 751                        if (!right) {
 752                                tnode_free(left);
 753                                goto nomem;
 754                        }
 755
 756                        put_child(tn, 2*i, (struct rt_trie_node *) left);
 757                        put_child(tn, 2*i+1, (struct rt_trie_node *) right);
 758                }
 759        }
 760
 761        for (i = 0; i < olen; i++) {
 762                struct tnode *inode;
 763                struct rt_trie_node *node = tnode_get_child(oldtnode, i);
 764                struct tnode *left, *right;
 765                int size, j;
 766
 767                /* An empty child */
 768                if (node == NULL)
 769                        continue;
 770
 771                /* A leaf or an internal node with skipped bits */
 772
 773                if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 774                   tn->pos + tn->bits - 1) {
 775                        if (tkey_extract_bits(node->key,
 776                                              oldtnode->pos + oldtnode->bits,
 777                                              1) == 0)
 778                                put_child(tn, 2*i, node);
 779                        else
 780                                put_child(tn, 2*i+1, node);
 781                        continue;
 782                }
 783
 784                /* An internal node with two children */
 785                inode = (struct tnode *) node;
 786
 787                if (inode->bits == 1) {
 788                        put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
 789                        put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
 790
 791                        tnode_free_safe(inode);
 792                        continue;
 793                }
 794
 795                /* An internal node with more than two children */
 796
 797                /* We will replace this node 'inode' with two new
 798                 * ones, 'left' and 'right', each with half of the
 799                 * original children. The two new nodes will have
 800                 * a position one bit further down the key and this
 801                 * means that the "significant" part of their keys
 802                 * (see the discussion near the top of this file)
 803                 * will differ by one bit, which will be "0" in
 804                 * left's key and "1" in right's key. Since we are
 805                 * moving the key position by one step, the bit that
 806                 * we are moving away from - the bit at position
 807                 * (inode->pos) - is the one that will differ between
 808                 * left and right. So... we synthesize that bit in the
 809                 * two  new keys.
 810                 * The mask 'm' below will be a single "one" bit at
 811                 * the position (inode->pos)
 812                 */
 813
 814                /* Use the old key, but set the new significant
 815                 *   bit to zero.
 816                 */
 817
 818                left = (struct tnode *) tnode_get_child(tn, 2*i);
 819                put_child(tn, 2*i, NULL);
 820
 821                BUG_ON(!left);
 822
 823                right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 824                put_child(tn, 2*i+1, NULL);
 825
 826                BUG_ON(!right);
 827
 828                size = tnode_child_length(left);
 829                for (j = 0; j < size; j++) {
 830                        put_child(left, j, rtnl_dereference(inode->child[j]));
 831                        put_child(right, j, rtnl_dereference(inode->child[j + size]));
 832                }
 833                put_child(tn, 2*i, resize(t, left));
 834                put_child(tn, 2*i+1, resize(t, right));
 835
 836                tnode_free_safe(inode);
 837        }
 838        tnode_free_safe(oldtnode);
 839        return tn;
 840nomem:
 841        tnode_clean_free(tn);
 842        return ERR_PTR(-ENOMEM);
 843}
 844
 845static struct tnode *halve(struct trie *t, struct tnode *tn)
 846{
 847        struct tnode *oldtnode = tn;
 848        struct rt_trie_node *left, *right;
 849        int i;
 850        int olen = tnode_child_length(tn);
 851
 852        pr_debug("In halve\n");
 853
 854        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 855
 856        if (!tn)
 857                return ERR_PTR(-ENOMEM);
 858
 859        /*
 860         * Preallocate and store tnodes before the actual work so we
 861         * don't get into an inconsistent state if memory allocation
 862         * fails. In case of failure we return the oldnode and halve
 863         * of tnode is ignored.
 864         */
 865
 866        for (i = 0; i < olen; i += 2) {
 867                left = tnode_get_child(oldtnode, i);
 868                right = tnode_get_child(oldtnode, i+1);
 869
 870                /* Two nonempty children */
 871                if (left && right) {
 872                        struct tnode *newn;
 873
 874                        newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 875
 876                        if (!newn)
 877                                goto nomem;
 878
 879                        put_child(tn, i/2, (struct rt_trie_node *)newn);
 880                }
 881
 882        }
 883
 884        for (i = 0; i < olen; i += 2) {
 885                struct tnode *newBinNode;
 886
 887                left = tnode_get_child(oldtnode, i);
 888                right = tnode_get_child(oldtnode, i+1);
 889
 890                /* At least one of the children is empty */
 891                if (left == NULL) {
 892                        if (right == NULL)    /* Both are empty */
 893                                continue;
 894                        put_child(tn, i/2, right);
 895                        continue;
 896                }
 897
 898                if (right == NULL) {
 899                        put_child(tn, i/2, left);
 900                        continue;
 901                }
 902
 903                /* Two nonempty children */
 904                newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 905                put_child(tn, i/2, NULL);
 906                put_child(newBinNode, 0, left);
 907                put_child(newBinNode, 1, right);
 908                put_child(tn, i/2, resize(t, newBinNode));
 909        }
 910        tnode_free_safe(oldtnode);
 911        return tn;
 912nomem:
 913        tnode_clean_free(tn);
 914        return ERR_PTR(-ENOMEM);
 915}
 916
 917/* readside must use rcu_read_lock currently dump routines
 918 via get_fa_head and dump */
 919
 920static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 921{
 922        struct hlist_head *head = &l->list;
 923        struct hlist_node *node;
 924        struct leaf_info *li;
 925
 926        hlist_for_each_entry_rcu(li, node, head, hlist)
 927                if (li->plen == plen)
 928                        return li;
 929
 930        return NULL;
 931}
 932
 933static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 934{
 935        struct leaf_info *li = find_leaf_info(l, plen);
 936
 937        if (!li)
 938                return NULL;
 939
 940        return &li->falh;
 941}
 942
 943static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 944{
 945        struct leaf_info *li = NULL, *last = NULL;
 946        struct hlist_node *node;
 947
 948        if (hlist_empty(head)) {
 949                hlist_add_head_rcu(&new->hlist, head);
 950        } else {
 951                hlist_for_each_entry(li, node, head, hlist) {
 952                        if (new->plen > li->plen)
 953                                break;
 954
 955                        last = li;
 956                }
 957                if (last)
 958                        hlist_add_after_rcu(&last->hlist, &new->hlist);
 959                else
 960                        hlist_add_before_rcu(&new->hlist, &li->hlist);
 961        }
 962}
 963
 964/* rcu_read_lock needs to be hold by caller from readside */
 965
 966static struct leaf *
 967fib_find_node(struct trie *t, u32 key)
 968{
 969        int pos;
 970        struct tnode *tn;
 971        struct rt_trie_node *n;
 972
 973        pos = 0;
 974        n = rcu_dereference_rtnl(t->trie);
 975
 976        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 977                tn = (struct tnode *) n;
 978
 979                check_tnode(tn);
 980
 981                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 982                        pos = tn->pos + tn->bits;
 983                        n = tnode_get_child_rcu(tn,
 984                                                tkey_extract_bits(key,
 985                                                                  tn->pos,
 986                                                                  tn->bits));
 987                } else
 988                        break;
 989        }
 990        /* Case we have found a leaf. Compare prefixes */
 991
 992        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 993                return (struct leaf *)n;
 994
 995        return NULL;
 996}
 997
 998static void trie_rebalance(struct trie *t, struct tnode *tn)
 999{
1000        int wasfull;
1001        t_key cindex, key;
1002        struct tnode *tp;
1003
1004        key = tn->key;
1005
1006        while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1007                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1008                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1009                tn = (struct tnode *)resize(t, tn);
1010
1011                tnode_put_child_reorg(tp, cindex,
1012                                      (struct rt_trie_node *)tn, wasfull);
1013
1014                tp = node_parent((struct rt_trie_node *) tn);
1015                if (!tp)
1016                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1017
1018                tnode_free_flush();
1019                if (!tp)
1020                        break;
1021                tn = tp;
1022        }
1023
1024        /* Handle last (top) tnode */
1025        if (IS_TNODE(tn))
1026                tn = (struct tnode *)resize(t, tn);
1027
1028        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1029        tnode_free_flush();
1030}
1031
1032/* only used from updater-side */
1033
1034static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1035{
1036        int pos, newpos;
1037        struct tnode *tp = NULL, *tn = NULL;
1038        struct rt_trie_node *n;
1039        struct leaf *l;
1040        int missbit;
1041        struct list_head *fa_head = NULL;
1042        struct leaf_info *li;
1043        t_key cindex;
1044
1045        pos = 0;
1046        n = rtnl_dereference(t->trie);
1047
1048        /* If we point to NULL, stop. Either the tree is empty and we should
1049         * just put a new leaf in if, or we have reached an empty child slot,
1050         * and we should just put our new leaf in that.
1051         * If we point to a T_TNODE, check if it matches our key. Note that
1052         * a T_TNODE might be skipping any number of bits - its 'pos' need
1053         * not be the parent's 'pos'+'bits'!
1054         *
1055         * If it does match the current key, get pos/bits from it, extract
1056         * the index from our key, push the T_TNODE and walk the tree.
1057         *
1058         * If it doesn't, we have to replace it with a new T_TNODE.
1059         *
1060         * If we point to a T_LEAF, it might or might not have the same key
1061         * as we do. If it does, just change the value, update the T_LEAF's
1062         * value, and return it.
1063         * If it doesn't, we need to replace it with a T_TNODE.
1064         */
1065
1066        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1067                tn = (struct tnode *) n;
1068
1069                check_tnode(tn);
1070
1071                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1072                        tp = tn;
1073                        pos = tn->pos + tn->bits;
1074                        n = tnode_get_child(tn,
1075                                            tkey_extract_bits(key,
1076                                                              tn->pos,
1077                                                              tn->bits));
1078
1079                        BUG_ON(n && node_parent(n) != tn);
1080                } else
1081                        break;
1082        }
1083
1084        /*
1085         * n  ----> NULL, LEAF or TNODE
1086         *
1087         * tp is n's (parent) ----> NULL or TNODE
1088         */
1089
1090        BUG_ON(tp && IS_LEAF(tp));
1091
1092        /* Case 1: n is a leaf. Compare prefixes */
1093
1094        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1095                l = (struct leaf *) n;
1096                li = leaf_info_new(plen);
1097
1098                if (!li)
1099                        return NULL;
1100
1101                fa_head = &li->falh;
1102                insert_leaf_info(&l->list, li);
1103                goto done;
1104        }
1105        l = leaf_new();
1106
1107        if (!l)
1108                return NULL;
1109
1110        l->key = key;
1111        li = leaf_info_new(plen);
1112
1113        if (!li) {
1114                free_leaf(l);
1115                return NULL;
1116        }
1117
1118        fa_head = &li->falh;
1119        insert_leaf_info(&l->list, li);
1120
1121        if (t->trie && n == NULL) {
1122                /* Case 2: n is NULL, and will just insert a new leaf */
1123
1124                node_set_parent((struct rt_trie_node *)l, tp);
1125
1126                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1127                put_child(tp, cindex, (struct rt_trie_node *)l);
1128        } else {
1129                /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1130                /*
1131                 *  Add a new tnode here
1132                 *  first tnode need some special handling
1133                 */
1134
1135                if (tp)
1136                        pos = tp->pos+tp->bits;
1137                else
1138                        pos = 0;
1139
1140                if (n) {
1141                        newpos = tkey_mismatch(key, pos, n->key);
1142                        tn = tnode_new(n->key, newpos, 1);
1143                } else {
1144                        newpos = 0;
1145                        tn = tnode_new(key, newpos, 1); /* First tnode */
1146                }
1147
1148                if (!tn) {
1149                        free_leaf_info(li);
1150                        free_leaf(l);
1151                        return NULL;
1152                }
1153
1154                node_set_parent((struct rt_trie_node *)tn, tp);
1155
1156                missbit = tkey_extract_bits(key, newpos, 1);
1157                put_child(tn, missbit, (struct rt_trie_node *)l);
1158                put_child(tn, 1-missbit, n);
1159
1160                if (tp) {
1161                        cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1162                        put_child(tp, cindex, (struct rt_trie_node *)tn);
1163                } else {
1164                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1165                        tp = tn;
1166                }
1167        }
1168
1169        if (tp && tp->pos + tp->bits > 32)
1170                pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1171                        tp, tp->pos, tp->bits, key, plen);
1172
1173        /* Rebalance the trie */
1174
1175        trie_rebalance(t, tp);
1176done:
1177        return fa_head;
1178}
1179
1180/*
1181 * Caller must hold RTNL.
1182 */
1183int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1184{
1185        struct trie *t = (struct trie *) tb->tb_data;
1186        struct fib_alias *fa, *new_fa;
1187        struct list_head *fa_head = NULL;
1188        struct fib_info *fi;
1189        int plen = cfg->fc_dst_len;
1190        u8 tos = cfg->fc_tos;
1191        u32 key, mask;
1192        int err;
1193        struct leaf *l;
1194
1195        if (plen > 32)
1196                return -EINVAL;
1197
1198        key = ntohl(cfg->fc_dst);
1199
1200        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1201
1202        mask = ntohl(inet_make_mask(plen));
1203
1204        if (key & ~mask)
1205                return -EINVAL;
1206
1207        key = key & mask;
1208
1209        fi = fib_create_info(cfg);
1210        if (IS_ERR(fi)) {
1211                err = PTR_ERR(fi);
1212                goto err;
1213        }
1214
1215        l = fib_find_node(t, key);
1216        fa = NULL;
1217
1218        if (l) {
1219                fa_head = get_fa_head(l, plen);
1220                fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1221        }
1222
1223        /* Now fa, if non-NULL, points to the first fib alias
1224         * with the same keys [prefix,tos,priority], if such key already
1225         * exists or to the node before which we will insert new one.
1226         *
1227         * If fa is NULL, we will need to allocate a new one and
1228         * insert to the head of f.
1229         *
1230         * If f is NULL, no fib node matched the destination key
1231         * and we need to allocate a new one of those as well.
1232         */
1233
1234        if (fa && fa->fa_tos == tos &&
1235            fa->fa_info->fib_priority == fi->fib_priority) {
1236                struct fib_alias *fa_first, *fa_match;
1237
1238                err = -EEXIST;
1239                if (cfg->fc_nlflags & NLM_F_EXCL)
1240                        goto out;
1241
1242                /* We have 2 goals:
1243                 * 1. Find exact match for type, scope, fib_info to avoid
1244                 * duplicate routes
1245                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1246                 */
1247                fa_match = NULL;
1248                fa_first = fa;
1249                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1250                list_for_each_entry_continue(fa, fa_head, fa_list) {
1251                        if (fa->fa_tos != tos)
1252                                break;
1253                        if (fa->fa_info->fib_priority != fi->fib_priority)
1254                                break;
1255                        if (fa->fa_type == cfg->fc_type &&
1256                            fa->fa_info == fi) {
1257                                fa_match = fa;
1258                                break;
1259                        }
1260                }
1261
1262                if (cfg->fc_nlflags & NLM_F_REPLACE) {
1263                        struct fib_info *fi_drop;
1264                        u8 state;
1265
1266                        fa = fa_first;
1267                        if (fa_match) {
1268                                if (fa == fa_match)
1269                                        err = 0;
1270                                goto out;
1271                        }
1272                        err = -ENOBUFS;
1273                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1274                        if (new_fa == NULL)
1275                                goto out;
1276
1277                        fi_drop = fa->fa_info;
1278                        new_fa->fa_tos = fa->fa_tos;
1279                        new_fa->fa_info = fi;
1280                        new_fa->fa_type = cfg->fc_type;
1281                        state = fa->fa_state;
1282                        new_fa->fa_state = state & ~FA_S_ACCESSED;
1283
1284                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1285                        alias_free_mem_rcu(fa);
1286
1287                        fib_release_info(fi_drop);
1288                        if (state & FA_S_ACCESSED)
1289                                rt_cache_flush(cfg->fc_nlinfo.nl_net);
1290                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1291                                tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1292
1293                        goto succeeded;
1294                }
1295                /* Error if we find a perfect match which
1296                 * uses the same scope, type, and nexthop
1297                 * information.
1298                 */
1299                if (fa_match)
1300                        goto out;
1301
1302                if (!(cfg->fc_nlflags & NLM_F_APPEND))
1303                        fa = fa_first;
1304        }
1305        err = -ENOENT;
1306        if (!(cfg->fc_nlflags & NLM_F_CREATE))
1307                goto out;
1308
1309        err = -ENOBUFS;
1310        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1311        if (new_fa == NULL)
1312                goto out;
1313
1314        new_fa->fa_info = fi;
1315        new_fa->fa_tos = tos;
1316        new_fa->fa_type = cfg->fc_type;
1317        new_fa->fa_state = 0;
1318        /*
1319         * Insert new entry to the list.
1320         */
1321
1322        if (!fa_head) {
1323                fa_head = fib_insert_node(t, key, plen);
1324                if (unlikely(!fa_head)) {
1325                        err = -ENOMEM;
1326                        goto out_free_new_fa;
1327                }
1328        }
1329
1330        if (!plen)
1331                tb->tb_num_default++;
1332
1333        list_add_tail_rcu(&new_fa->fa_list,
1334                          (fa ? &fa->fa_list : fa_head));
1335
1336        rt_cache_flush(cfg->fc_nlinfo.nl_net);
1337        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1338                  &cfg->fc_nlinfo, 0);
1339succeeded:
1340        return 0;
1341
1342out_free_new_fa:
1343        kmem_cache_free(fn_alias_kmem, new_fa);
1344out:
1345        fib_release_info(fi);
1346err:
1347        return err;
1348}
1349
1350/* should be called with rcu_read_lock */
1351static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1352                      t_key key,  const struct flowi4 *flp,
1353                      struct fib_result *res, int fib_flags)
1354{
1355        struct leaf_info *li;
1356        struct hlist_head *hhead = &l->list;
1357        struct hlist_node *node;
1358
1359        hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1360                struct fib_alias *fa;
1361
1362                if (l->key != (key & li->mask_plen))
1363                        continue;
1364
1365                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1366                        struct fib_info *fi = fa->fa_info;
1367                        int nhsel, err;
1368
1369                        if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1370                                continue;
1371                        if (fi->fib_dead)
1372                                continue;
1373                        if (fa->fa_info->fib_scope < flp->flowi4_scope)
1374                                continue;
1375                        fib_alias_accessed(fa);
1376                        err = fib_props[fa->fa_type].error;
1377                        if (err) {
1378#ifdef CONFIG_IP_FIB_TRIE_STATS
1379                                t->stats.semantic_match_passed++;
1380#endif
1381                                return err;
1382                        }
1383                        if (fi->fib_flags & RTNH_F_DEAD)
1384                                continue;
1385                        for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1386                                const struct fib_nh *nh = &fi->fib_nh[nhsel];
1387
1388                                if (nh->nh_flags & RTNH_F_DEAD)
1389                                        continue;
1390                                if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1391                                        continue;
1392
1393#ifdef CONFIG_IP_FIB_TRIE_STATS
1394                                t->stats.semantic_match_passed++;
1395#endif
1396                                res->prefixlen = li->plen;
1397                                res->nh_sel = nhsel;
1398                                res->type = fa->fa_type;
1399                                res->scope = fa->fa_info->fib_scope;
1400                                res->fi = fi;
1401                                res->table = tb;
1402                                res->fa_head = &li->falh;
1403                                if (!(fib_flags & FIB_LOOKUP_NOREF))
1404                                        atomic_inc(&fi->fib_clntref);
1405                                return 0;
1406                        }
1407                }
1408
1409#ifdef CONFIG_IP_FIB_TRIE_STATS
1410                t->stats.semantic_match_miss++;
1411#endif
1412        }
1413
1414        return 1;
1415}
1416
1417int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1418                     struct fib_result *res, int fib_flags)
1419{
1420        struct trie *t = (struct trie *) tb->tb_data;
1421        int ret;
1422        struct rt_trie_node *n;
1423        struct tnode *pn;
1424        unsigned int pos, bits;
1425        t_key key = ntohl(flp->daddr);
1426        unsigned int chopped_off;
1427        t_key cindex = 0;
1428        unsigned int current_prefix_length = KEYLENGTH;
1429        struct tnode *cn;
1430        t_key pref_mismatch;
1431
1432        rcu_read_lock();
1433
1434        n = rcu_dereference(t->trie);
1435        if (!n)
1436                goto failed;
1437
1438#ifdef CONFIG_IP_FIB_TRIE_STATS
1439        t->stats.gets++;
1440#endif
1441
1442        /* Just a leaf? */
1443        if (IS_LEAF(n)) {
1444                ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1445                goto found;
1446        }
1447
1448        pn = (struct tnode *) n;
1449        chopped_off = 0;
1450
1451        while (pn) {
1452                pos = pn->pos;
1453                bits = pn->bits;
1454
1455                if (!chopped_off)
1456                        cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1457                                                   pos, bits);
1458
1459                n = tnode_get_child_rcu(pn, cindex);
1460
1461                if (n == NULL) {
1462#ifdef CONFIG_IP_FIB_TRIE_STATS
1463                        t->stats.null_node_hit++;
1464#endif
1465                        goto backtrace;
1466                }
1467
1468                if (IS_LEAF(n)) {
1469                        ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1470                        if (ret > 0)
1471                                goto backtrace;
1472                        goto found;
1473                }
1474
1475                cn = (struct tnode *)n;
1476
1477                /*
1478                 * It's a tnode, and we can do some extra checks here if we
1479                 * like, to avoid descending into a dead-end branch.
1480                 * This tnode is in the parent's child array at index
1481                 * key[p_pos..p_pos+p_bits] but potentially with some bits
1482                 * chopped off, so in reality the index may be just a
1483                 * subprefix, padded with zero at the end.
1484                 * We can also take a look at any skipped bits in this
1485                 * tnode - everything up to p_pos is supposed to be ok,
1486                 * and the non-chopped bits of the index (se previous
1487                 * paragraph) are also guaranteed ok, but the rest is
1488                 * considered unknown.
1489                 *
1490                 * The skipped bits are key[pos+bits..cn->pos].
1491                 */
1492
1493                /* If current_prefix_length < pos+bits, we are already doing
1494                 * actual prefix  matching, which means everything from
1495                 * pos+(bits-chopped_off) onward must be zero along some
1496                 * branch of this subtree - otherwise there is *no* valid
1497                 * prefix present. Here we can only check the skipped
1498                 * bits. Remember, since we have already indexed into the
1499                 * parent's child array, we know that the bits we chopped of
1500                 * *are* zero.
1501                 */
1502
1503                /* NOTA BENE: Checking only skipped bits
1504                   for the new node here */
1505
1506                if (current_prefix_length < pos+bits) {
1507                        if (tkey_extract_bits(cn->key, current_prefix_length,
1508                                                cn->pos - current_prefix_length)
1509                            || !(cn->child[0]))
1510                                goto backtrace;
1511                }
1512
1513                /*
1514                 * If chopped_off=0, the index is fully validated and we
1515                 * only need to look at the skipped bits for this, the new,
1516                 * tnode. What we actually want to do is to find out if
1517                 * these skipped bits match our key perfectly, or if we will
1518                 * have to count on finding a matching prefix further down,
1519                 * because if we do, we would like to have some way of
1520                 * verifying the existence of such a prefix at this point.
1521                 */
1522
1523                /* The only thing we can do at this point is to verify that
1524                 * any such matching prefix can indeed be a prefix to our
1525                 * key, and if the bits in the node we are inspecting that
1526                 * do not match our key are not ZERO, this cannot be true.
1527                 * Thus, find out where there is a mismatch (before cn->pos)
1528                 * and verify that all the mismatching bits are zero in the
1529                 * new tnode's key.
1530                 */
1531
1532                /*
1533                 * Note: We aren't very concerned about the piece of
1534                 * the key that precede pn->pos+pn->bits, since these
1535                 * have already been checked. The bits after cn->pos
1536                 * aren't checked since these are by definition
1537                 * "unknown" at this point. Thus, what we want to see
1538                 * is if we are about to enter the "prefix matching"
1539                 * state, and in that case verify that the skipped
1540                 * bits that will prevail throughout this subtree are
1541                 * zero, as they have to be if we are to find a
1542                 * matching prefix.
1543                 */
1544
1545                pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1546
1547                /*
1548                 * In short: If skipped bits in this node do not match
1549                 * the search key, enter the "prefix matching"
1550                 * state.directly.
1551                 */
1552                if (pref_mismatch) {
1553                        int mp = KEYLENGTH - fls(pref_mismatch);
1554
1555                        if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1556                                goto backtrace;
1557
1558                        if (current_prefix_length >= cn->pos)
1559                                current_prefix_length = mp;
1560                }
1561
1562                pn = (struct tnode *)n; /* Descend */
1563                chopped_off = 0;
1564                continue;
1565
1566backtrace:
1567                chopped_off++;
1568
1569                /* As zero don't change the child key (cindex) */
1570                while ((chopped_off <= pn->bits)
1571                       && !(cindex & (1<<(chopped_off-1))))
1572                        chopped_off++;
1573
1574                /* Decrease current_... with bits chopped off */
1575                if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1576                        current_prefix_length = pn->pos + pn->bits
1577                                - chopped_off;
1578
1579                /*
1580                 * Either we do the actual chop off according or if we have
1581                 * chopped off all bits in this tnode walk up to our parent.
1582                 */
1583
1584                if (chopped_off <= pn->bits) {
1585                        cindex &= ~(1 << (chopped_off-1));
1586                } else {
1587                        struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1588                        if (!parent)
1589                                goto failed;
1590
1591                        /* Get Child's index */
1592                        cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1593                        pn = parent;
1594                        chopped_off = 0;
1595
1596#ifdef CONFIG_IP_FIB_TRIE_STATS
1597                        t->stats.backtrack++;
1598#endif
1599                        goto backtrace;
1600                }
1601        }
1602failed:
1603        ret = 1;
1604found:
1605        rcu_read_unlock();
1606        return ret;
1607}
1608EXPORT_SYMBOL_GPL(fib_table_lookup);
1609
1610/*
1611 * Remove the leaf and return parent.
1612 */
1613static void trie_leaf_remove(struct trie *t, struct leaf *l)
1614{
1615        struct tnode *tp = node_parent((struct rt_trie_node *) l);
1616
1617        pr_debug("entering trie_leaf_remove(%p)\n", l);
1618
1619        if (tp) {
1620                t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1621                put_child(tp, cindex, NULL);
1622                trie_rebalance(t, tp);
1623        } else
1624                RCU_INIT_POINTER(t->trie, NULL);
1625
1626        free_leaf(l);
1627}
1628
1629/*
1630 * Caller must hold RTNL.
1631 */
1632int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1633{
1634        struct trie *t = (struct trie *) tb->tb_data;
1635        u32 key, mask;
1636        int plen = cfg->fc_dst_len;
1637        u8 tos = cfg->fc_tos;
1638        struct fib_alias *fa, *fa_to_delete;
1639        struct list_head *fa_head;
1640        struct leaf *l;
1641        struct leaf_info *li;
1642
1643        if (plen > 32)
1644                return -EINVAL;
1645
1646        key = ntohl(cfg->fc_dst);
1647        mask = ntohl(inet_make_mask(plen));
1648
1649        if (key & ~mask)
1650                return -EINVAL;
1651
1652        key = key & mask;
1653        l = fib_find_node(t, key);
1654
1655        if (!l)
1656                return -ESRCH;
1657
1658        fa_head = get_fa_head(l, plen);
1659        fa = fib_find_alias(fa_head, tos, 0);
1660
1661        if (!fa)
1662                return -ESRCH;
1663
1664        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1665
1666        fa_to_delete = NULL;
1667        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1668        list_for_each_entry_continue(fa, fa_head, fa_list) {
1669                struct fib_info *fi = fa->fa_info;
1670
1671                if (fa->fa_tos != tos)
1672                        break;
1673
1674                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1675                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1676                     fa->fa_info->fib_scope == cfg->fc_scope) &&
1677                    (!cfg->fc_prefsrc ||
1678                     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1679                    (!cfg->fc_protocol ||
1680                     fi->fib_protocol == cfg->fc_protocol) &&
1681                    fib_nh_match(cfg, fi) == 0) {
1682                        fa_to_delete = fa;
1683                        break;
1684                }
1685        }
1686
1687        if (!fa_to_delete)
1688                return -ESRCH;
1689
1690        fa = fa_to_delete;
1691        rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1692                  &cfg->fc_nlinfo, 0);
1693
1694        l = fib_find_node(t, key);
1695        li = find_leaf_info(l, plen);
1696
1697        list_del_rcu(&fa->fa_list);
1698
1699        if (!plen)
1700                tb->tb_num_default--;
1701
1702        if (list_empty(fa_head)) {
1703                hlist_del_rcu(&li->hlist);
1704                free_leaf_info(li);
1705        }
1706
1707        if (hlist_empty(&l->list))
1708                trie_leaf_remove(t, l);
1709
1710        if (fa->fa_state & FA_S_ACCESSED)
1711                rt_cache_flush(cfg->fc_nlinfo.nl_net);
1712
1713        fib_release_info(fa->fa_info);
1714        alias_free_mem_rcu(fa);
1715        return 0;
1716}
1717
1718static int trie_flush_list(struct list_head *head)
1719{
1720        struct fib_alias *fa, *fa_node;
1721        int found = 0;
1722
1723        list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1724                struct fib_info *fi = fa->fa_info;
1725
1726                if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1727                        list_del_rcu(&fa->fa_list);
1728                        fib_release_info(fa->fa_info);
1729                        alias_free_mem_rcu(fa);
1730                        found++;
1731                }
1732        }
1733        return found;
1734}
1735
1736static int trie_flush_leaf(struct leaf *l)
1737{
1738        int found = 0;
1739        struct hlist_head *lih = &l->list;
1740        struct hlist_node *node, *tmp;
1741        struct leaf_info *li = NULL;
1742
1743        hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1744                found += trie_flush_list(&li->falh);
1745
1746                if (list_empty(&li->falh)) {
1747                        hlist_del_rcu(&li->hlist);
1748                        free_leaf_info(li);
1749                }
1750        }
1751        return found;
1752}
1753
1754/*
1755 * Scan for the next right leaf starting at node p->child[idx]
1756 * Since we have back pointer, no recursion necessary.
1757 */
1758static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1759{
1760        do {
1761                t_key idx;
1762
1763                if (c)
1764                        idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1765                else
1766                        idx = 0;
1767
1768                while (idx < 1u << p->bits) {
1769                        c = tnode_get_child_rcu(p, idx++);
1770                        if (!c)
1771                                continue;
1772
1773                        if (IS_LEAF(c)) {
1774                                prefetch(rcu_dereference_rtnl(p->child[idx]));
1775                                return (struct leaf *) c;
1776                        }
1777
1778                        /* Rescan start scanning in new node */
1779                        p = (struct tnode *) c;
1780                        idx = 0;
1781                }
1782
1783                /* Node empty, walk back up to parent */
1784                c = (struct rt_trie_node *) p;
1785        } while ((p = node_parent_rcu(c)) != NULL);
1786
1787        return NULL; /* Root of trie */
1788}
1789
1790static struct leaf *trie_firstleaf(struct trie *t)
1791{
1792        struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1793
1794        if (!n)
1795                return NULL;
1796
1797        if (IS_LEAF(n))          /* trie is just a leaf */
1798                return (struct leaf *) n;
1799
1800        return leaf_walk_rcu(n, NULL);
1801}
1802
1803static struct leaf *trie_nextleaf(struct leaf *l)
1804{
1805        struct rt_trie_node *c = (struct rt_trie_node *) l;
1806        struct tnode *p = node_parent_rcu(c);
1807
1808        if (!p)
1809                return NULL;    /* trie with just one leaf */
1810
1811        return leaf_walk_rcu(p, c);
1812}
1813
1814static struct leaf *trie_leafindex(struct trie *t, int index)
1815{
1816        struct leaf *l = trie_firstleaf(t);
1817
1818        while (l && index-- > 0)
1819                l = trie_nextleaf(l);
1820
1821        return l;
1822}
1823
1824
1825/*
1826 * Caller must hold RTNL.
1827 */
1828int fib_table_flush(struct fib_table *tb)
1829{
1830        struct trie *t = (struct trie *) tb->tb_data;
1831        struct leaf *l, *ll = NULL;
1832        int found = 0;
1833
1834        for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1835                found += trie_flush_leaf(l);
1836
1837                if (ll && hlist_empty(&ll->list))
1838                        trie_leaf_remove(t, ll);
1839                ll = l;
1840        }
1841
1842        if (ll && hlist_empty(&ll->list))
1843                trie_leaf_remove(t, ll);
1844
1845        pr_debug("trie_flush found=%d\n", found);
1846        return found;
1847}
1848
1849void fib_free_table(struct fib_table *tb)
1850{
1851        kfree(tb);
1852}
1853
1854static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1855                           struct fib_table *tb,
1856                           struct sk_buff *skb, struct netlink_callback *cb)
1857{
1858        int i, s_i;
1859        struct fib_alias *fa;
1860        __be32 xkey = htonl(key);
1861
1862        s_i = cb->args[5];
1863        i = 0;
1864
1865        /* rcu_read_lock is hold by caller */
1866
1867        list_for_each_entry_rcu(fa, fah, fa_list) {
1868                if (i < s_i) {
1869                        i++;
1870                        continue;
1871                }
1872
1873                if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1874                                  cb->nlh->nlmsg_seq,
1875                                  RTM_NEWROUTE,
1876                                  tb->tb_id,
1877                                  fa->fa_type,
1878                                  xkey,
1879                                  plen,
1880                                  fa->fa_tos,
1881                                  fa->fa_info, NLM_F_MULTI) < 0) {
1882                        cb->args[5] = i;
1883                        return -1;
1884                }
1885                i++;
1886        }
1887        cb->args[5] = i;
1888        return skb->len;
1889}
1890
1891static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1892                        struct sk_buff *skb, struct netlink_callback *cb)
1893{
1894        struct leaf_info *li;
1895        struct hlist_node *node;
1896        int i, s_i;
1897
1898        s_i = cb->args[4];
1899        i = 0;
1900
1901        /* rcu_read_lock is hold by caller */
1902        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1903                if (i < s_i) {
1904                        i++;
1905                        continue;
1906                }
1907
1908                if (i > s_i)
1909                        cb->args[5] = 0;
1910
1911                if (list_empty(&li->falh))
1912                        continue;
1913
1914                if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1915                        cb->args[4] = i;
1916                        return -1;
1917                }
1918                i++;
1919        }
1920
1921        cb->args[4] = i;
1922        return skb->len;
1923}
1924
1925int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1926                   struct netlink_callback *cb)
1927{
1928        struct leaf *l;
1929        struct trie *t = (struct trie *) tb->tb_data;
1930        t_key key = cb->args[2];
1931        int count = cb->args[3];
1932
1933        rcu_read_lock();
1934        /* Dump starting at last key.
1935         * Note: 0.0.0.0/0 (ie default) is first key.
1936         */
1937        if (count == 0)
1938                l = trie_firstleaf(t);
1939        else {
1940                /* Normally, continue from last key, but if that is missing
1941                 * fallback to using slow rescan
1942                 */
1943                l = fib_find_node(t, key);
1944                if (!l)
1945                        l = trie_leafindex(t, count);
1946        }
1947
1948        while (l) {
1949                cb->args[2] = l->key;
1950                if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1951                        cb->args[3] = count;
1952                        rcu_read_unlock();
1953                        return -1;
1954                }
1955
1956                ++count;
1957                l = trie_nextleaf(l);
1958                memset(&cb->args[4], 0,
1959                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1960        }
1961        cb->args[3] = count;
1962        rcu_read_unlock();
1963
1964        return skb->len;
1965}
1966
1967void __init fib_trie_init(void)
1968{
1969        fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1970                                          sizeof(struct fib_alias),
1971                                          0, SLAB_PANIC, NULL);
1972
1973        trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1974                                           max(sizeof(struct leaf),
1975                                               sizeof(struct leaf_info)),
1976                                           0, SLAB_PANIC, NULL);
1977}
1978
1979
1980struct fib_table *fib_trie_table(u32 id)
1981{
1982        struct fib_table *tb;
1983        struct trie *t;
1984
1985        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1986                     GFP_KERNEL);
1987        if (tb == NULL)
1988                return NULL;
1989
1990        tb->tb_id = id;
1991        tb->tb_default = -1;
1992        tb->tb_num_default = 0;
1993
1994        t = (struct trie *) tb->tb_data;
1995        memset(t, 0, sizeof(*t));
1996
1997        return tb;
1998}
1999
2000#ifdef CONFIG_PROC_FS
2001/* Depth first Trie walk iterator */
2002struct fib_trie_iter {
2003        struct seq_net_private p;
2004        struct fib_table *tb;
2005        struct tnode *tnode;
2006        unsigned int index;
2007        unsigned int depth;
2008};
2009
2010static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2011{
2012        struct tnode *tn = iter->tnode;
2013        unsigned int cindex = iter->index;
2014        struct tnode *p;
2015
2016        /* A single entry routing table */
2017        if (!tn)
2018                return NULL;
2019
2020        pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2021                 iter->tnode, iter->index, iter->depth);
2022rescan:
2023        while (cindex < (1<<tn->bits)) {
2024                struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2025
2026                if (n) {
2027                        if (IS_LEAF(n)) {
2028                                iter->tnode = tn;
2029                                iter->index = cindex + 1;
2030                        } else {
2031                                /* push down one level */
2032                                iter->tnode = (struct tnode *) n;
2033                                iter->index = 0;
2034                                ++iter->depth;
2035                        }
2036                        return n;
2037                }
2038
2039                ++cindex;
2040        }
2041
2042        /* Current node exhausted, pop back up */
2043        p = node_parent_rcu((struct rt_trie_node *)tn);
2044        if (p) {
2045                cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2046                tn = p;
2047                --iter->depth;
2048                goto rescan;
2049        }
2050
2051        /* got root? */
2052        return NULL;
2053}
2054
2055static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2056                                       struct trie *t)
2057{
2058        struct rt_trie_node *n;
2059
2060        if (!t)
2061                return NULL;
2062
2063        n = rcu_dereference(t->trie);
2064        if (!n)
2065                return NULL;
2066
2067        if (IS_TNODE(n)) {
2068                iter->tnode = (struct tnode *) n;
2069                iter->index = 0;
2070                iter->depth = 1;
2071        } else {
2072                iter->tnode = NULL;
2073                iter->index = 0;
2074                iter->depth = 0;
2075        }
2076
2077        return n;
2078}
2079
2080static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2081{
2082        struct rt_trie_node *n;
2083        struct fib_trie_iter iter;
2084
2085        memset(s, 0, sizeof(*s));
2086
2087        rcu_read_lock();
2088        for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2089                if (IS_LEAF(n)) {
2090                        struct leaf *l = (struct leaf *)n;
2091                        struct leaf_info *li;
2092                        struct hlist_node *tmp;
2093
2094                        s->leaves++;
2095                        s->totdepth += iter.depth;
2096                        if (iter.depth > s->maxdepth)
2097                                s->maxdepth = iter.depth;
2098
2099                        hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2100                                ++s->prefixes;
2101                } else {
2102                        const struct tnode *tn = (const struct tnode *) n;
2103                        int i;
2104
2105                        s->tnodes++;
2106                        if (tn->bits < MAX_STAT_DEPTH)
2107                                s->nodesizes[tn->bits]++;
2108
2109                        for (i = 0; i < (1<<tn->bits); i++)
2110                                if (!tn->child[i])
2111                                        s->nullpointers++;
2112                }
2113        }
2114        rcu_read_unlock();
2115}
2116
2117/*
2118 *      This outputs /proc/net/fib_triestats
2119 */
2120static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2121{
2122        unsigned int i, max, pointers, bytes, avdepth;
2123
2124        if (stat->leaves)
2125                avdepth = stat->totdepth*100 / stat->leaves;
2126        else
2127                avdepth = 0;
2128
2129        seq_printf(seq, "\tAver depth:     %u.%02d\n",
2130                   avdepth / 100, avdepth % 100);
2131        seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2132
2133        seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2134        bytes = sizeof(struct leaf) * stat->leaves;
2135
2136        seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2137        bytes += sizeof(struct leaf_info) * stat->prefixes;
2138
2139        seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2140        bytes += sizeof(struct tnode) * stat->tnodes;
2141
2142        max = MAX_STAT_DEPTH;
2143        while (max > 0 && stat->nodesizes[max-1] == 0)
2144                max--;
2145
2146        pointers = 0;
2147        for (i = 1; i <= max; i++)
2148                if (stat->nodesizes[i] != 0) {
2149                        seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2150                        pointers += (1<<i) * stat->nodesizes[i];
2151                }
2152        seq_putc(seq, '\n');
2153        seq_printf(seq, "\tPointers: %u\n", pointers);
2154
2155        bytes += sizeof(struct rt_trie_node *) * pointers;
2156        seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2157        seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2158}
2159
2160#ifdef CONFIG_IP_FIB_TRIE_STATS
2161static void trie_show_usage(struct seq_file *seq,
2162                            const struct trie_use_stats *stats)
2163{
2164        seq_printf(seq, "\nCounters:\n---------\n");
2165        seq_printf(seq, "gets = %u\n", stats->gets);
2166        seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2167        seq_printf(seq, "semantic match passed = %u\n",
2168                   stats->semantic_match_passed);
2169        seq_printf(seq, "semantic match miss = %u\n",
2170                   stats->semantic_match_miss);
2171        seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2172        seq_printf(seq, "skipped node resize = %u\n\n",
2173                   stats->resize_node_skipped);
2174}
2175#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2176
2177static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2178{
2179        if (tb->tb_id == RT_TABLE_LOCAL)
2180                seq_puts(seq, "Local:\n");
2181        else if (tb->tb_id == RT_TABLE_MAIN)
2182                seq_puts(seq, "Main:\n");
2183        else
2184                seq_printf(seq, "Id %d:\n", tb->tb_id);
2185}
2186
2187
2188static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2189{
2190        struct net *net = (struct net *)seq->private;
2191        unsigned int h;
2192
2193        seq_printf(seq,
2194                   "Basic info: size of leaf:"
2195                   " %Zd bytes, size of tnode: %Zd bytes.\n",
2196                   sizeof(struct leaf), sizeof(struct tnode));
2197
2198        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2199                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2200                struct hlist_node *node;
2201                struct fib_table *tb;
2202
2203                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2204                        struct trie *t = (struct trie *) tb->tb_data;
2205                        struct trie_stat stat;
2206
2207                        if (!t)
2208                                continue;
2209
2210                        fib_table_print(seq, tb);
2211
2212                        trie_collect_stats(t, &stat);
2213                        trie_show_stats(seq, &stat);
2214#ifdef CONFIG_IP_FIB_TRIE_STATS
2215                        trie_show_usage(seq, &t->stats);
2216#endif
2217                }
2218        }
2219
2220        return 0;
2221}
2222
2223static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2224{
2225        return single_open_net(inode, file, fib_triestat_seq_show);
2226}
2227
2228static const struct file_operations fib_triestat_fops = {
2229        .owner  = THIS_MODULE,
2230        .open   = fib_triestat_seq_open,
2231        .read   = seq_read,
2232        .llseek = seq_lseek,
2233        .release = single_release_net,
2234};
2235
2236static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2237{
2238        struct fib_trie_iter *iter = seq->private;
2239        struct net *net = seq_file_net(seq);
2240        loff_t idx = 0;
2241        unsigned int h;
2242
2243        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2244                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2245                struct hlist_node *node;
2246                struct fib_table *tb;
2247
2248                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2249                        struct rt_trie_node *n;
2250
2251                        for (n = fib_trie_get_first(iter,
2252                                                    (struct trie *) tb->tb_data);
2253                             n; n = fib_trie_get_next(iter))
2254                                if (pos == idx++) {
2255                                        iter->tb = tb;
2256                                        return n;
2257                                }
2258                }
2259        }
2260
2261        return NULL;
2262}
2263
2264static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2265        __acquires(RCU)
2266{
2267        rcu_read_lock();
2268        return fib_trie_get_idx(seq, *pos);
2269}
2270
2271static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2272{
2273        struct fib_trie_iter *iter = seq->private;
2274        struct net *net = seq_file_net(seq);
2275        struct fib_table *tb = iter->tb;
2276        struct hlist_node *tb_node;
2277        unsigned int h;
2278        struct rt_trie_node *n;
2279
2280        ++*pos;
2281        /* next node in same table */
2282        n = fib_trie_get_next(iter);
2283        if (n)
2284                return n;
2285
2286        /* walk rest of this hash chain */
2287        h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2288        while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2289                tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2290                n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2291                if (n)
2292                        goto found;
2293        }
2294
2295        /* new hash chain */
2296        while (++h < FIB_TABLE_HASHSZ) {
2297                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2298                hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2299                        n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2300                        if (n)
2301                                goto found;
2302                }
2303        }
2304        return NULL;
2305
2306found:
2307        iter->tb = tb;
2308        return n;
2309}
2310
2311static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2312        __releases(RCU)
2313{
2314        rcu_read_unlock();
2315}
2316
2317static void seq_indent(struct seq_file *seq, int n)
2318{
2319        while (n-- > 0)
2320                seq_puts(seq, "   ");
2321}
2322
2323static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2324{
2325        switch (s) {
2326        case RT_SCOPE_UNIVERSE: return "universe";
2327        case RT_SCOPE_SITE:     return "site";
2328        case RT_SCOPE_LINK:     return "link";
2329        case RT_SCOPE_HOST:     return "host";
2330        case RT_SCOPE_NOWHERE:  return "nowhere";
2331        default:
2332                snprintf(buf, len, "scope=%d", s);
2333                return buf;
2334        }
2335}
2336
2337static const char *const rtn_type_names[__RTN_MAX] = {
2338        [RTN_UNSPEC] = "UNSPEC",
2339        [RTN_UNICAST] = "UNICAST",
2340        [RTN_LOCAL] = "LOCAL",
2341        [RTN_BROADCAST] = "BROADCAST",
2342        [RTN_ANYCAST] = "ANYCAST",
2343        [RTN_MULTICAST] = "MULTICAST",
2344        [RTN_BLACKHOLE] = "BLACKHOLE",
2345        [RTN_UNREACHABLE] = "UNREACHABLE",
2346        [RTN_PROHIBIT] = "PROHIBIT",
2347        [RTN_THROW] = "THROW",
2348        [RTN_NAT] = "NAT",
2349        [RTN_XRESOLVE] = "XRESOLVE",
2350};
2351
2352static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2353{
2354        if (t < __RTN_MAX && rtn_type_names[t])
2355                return rtn_type_names[t];
2356        snprintf(buf, len, "type %u", t);
2357        return buf;
2358}
2359
2360/* Pretty print the trie */
2361static int fib_trie_seq_show(struct seq_file *seq, void *v)
2362{
2363        const struct fib_trie_iter *iter = seq->private;
2364        struct rt_trie_node *n = v;
2365
2366        if (!node_parent_rcu(n))
2367                fib_table_print(seq, iter->tb);
2368
2369        if (IS_TNODE(n)) {
2370                struct tnode *tn = (struct tnode *) n;
2371                __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2372
2373                seq_indent(seq, iter->depth-1);
2374                seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2375                           &prf, tn->pos, tn->bits, tn->full_children,
2376                           tn->empty_children);
2377
2378        } else {
2379                struct leaf *l = (struct leaf *) n;
2380                struct leaf_info *li;
2381                struct hlist_node *node;
2382                __be32 val = htonl(l->key);
2383
2384                seq_indent(seq, iter->depth);
2385                seq_printf(seq, "  |-- %pI4\n", &val);
2386
2387                hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2388                        struct fib_alias *fa;
2389
2390                        list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2391                                char buf1[32], buf2[32];
2392
2393                                seq_indent(seq, iter->depth+1);
2394                                seq_printf(seq, "  /%d %s %s", li->plen,
2395                                           rtn_scope(buf1, sizeof(buf1),
2396                                                     fa->fa_info->fib_scope),
2397                                           rtn_type(buf2, sizeof(buf2),
2398                                                    fa->fa_type));
2399                                if (fa->fa_tos)
2400                                        seq_printf(seq, " tos=%d", fa->fa_tos);
2401                                seq_putc(seq, '\n');
2402                        }
2403                }
2404        }
2405
2406        return 0;
2407}
2408
2409static const struct seq_operations fib_trie_seq_ops = {
2410        .start  = fib_trie_seq_start,
2411        .next   = fib_trie_seq_next,
2412        .stop   = fib_trie_seq_stop,
2413        .show   = fib_trie_seq_show,
2414};
2415
2416static int fib_trie_seq_open(struct inode *inode, struct file *file)
2417{
2418        return seq_open_net(inode, file, &fib_trie_seq_ops,
2419                            sizeof(struct fib_trie_iter));
2420}
2421
2422static const struct file_operations fib_trie_fops = {
2423        .owner  = THIS_MODULE,
2424        .open   = fib_trie_seq_open,
2425        .read   = seq_read,
2426        .llseek = seq_lseek,
2427        .release = seq_release_net,
2428};
2429
2430struct fib_route_iter {
2431        struct seq_net_private p;
2432        struct trie *main_trie;
2433        loff_t  pos;
2434        t_key   key;
2435};
2436
2437static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2438{
2439        struct leaf *l = NULL;
2440        struct trie *t = iter->main_trie;
2441
2442        /* use cache location of last found key */
2443        if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2444                pos -= iter->pos;
2445        else {
2446                iter->pos = 0;
2447                l = trie_firstleaf(t);
2448        }
2449
2450        while (l && pos-- > 0) {
2451                iter->pos++;
2452                l = trie_nextleaf(l);
2453        }
2454
2455        if (l)
2456                iter->key = pos;        /* remember it */
2457        else
2458                iter->pos = 0;          /* forget it */
2459
2460        return l;
2461}
2462
2463static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2464        __acquires(RCU)
2465{
2466        struct fib_route_iter *iter = seq->private;
2467        struct fib_table *tb;
2468
2469        rcu_read_lock();
2470        tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2471        if (!tb)
2472                return NULL;
2473
2474        iter->main_trie = (struct trie *) tb->tb_data;
2475        if (*pos == 0)
2476                return SEQ_START_TOKEN;
2477        else
2478                return fib_route_get_idx(iter, *pos - 1);
2479}
2480
2481static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2482{
2483        struct fib_route_iter *iter = seq->private;
2484        struct leaf *l = v;
2485
2486        ++*pos;
2487        if (v == SEQ_START_TOKEN) {
2488                iter->pos = 0;
2489                l = trie_firstleaf(iter->main_trie);
2490        } else {
2491                iter->pos++;
2492                l = trie_nextleaf(l);
2493        }
2494
2495        if (l)
2496                iter->key = l->key;
2497        else
2498                iter->pos = 0;
2499        return l;
2500}
2501
2502static void fib_route_seq_stop(struct seq_file *seq, void *v)
2503        __releases(RCU)
2504{
2505        rcu_read_unlock();
2506}
2507
2508static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2509{
2510        unsigned int flags = 0;
2511
2512        if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2513                flags = RTF_REJECT;
2514        if (fi && fi->fib_nh->nh_gw)
2515                flags |= RTF_GATEWAY;
2516        if (mask == htonl(0xFFFFFFFF))
2517                flags |= RTF_HOST;
2518        flags |= RTF_UP;
2519        return flags;
2520}
2521
2522/*
2523 *      This outputs /proc/net/route.
2524 *      The format of the file is not supposed to be changed
2525 *      and needs to be same as fib_hash output to avoid breaking
2526 *      legacy utilities
2527 */
2528static int fib_route_seq_show(struct seq_file *seq, void *v)
2529{
2530        struct leaf *l = v;
2531        struct leaf_info *li;
2532        struct hlist_node *node;
2533
2534        if (v == SEQ_START_TOKEN) {
2535                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2536                           "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2537                           "\tWindow\tIRTT");
2538                return 0;
2539        }
2540
2541        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2542                struct fib_alias *fa;
2543                __be32 mask, prefix;
2544
2545                mask = inet_make_mask(li->plen);
2546                prefix = htonl(l->key);
2547
2548                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2549                        const struct fib_info *fi = fa->fa_info;
2550                        unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2551                        int len;
2552
2553                        if (fa->fa_type == RTN_BROADCAST
2554                            || fa->fa_type == RTN_MULTICAST)
2555                                continue;
2556
2557                        if (fi)
2558                                seq_printf(seq,
2559                                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2560                                         "%d\t%08X\t%d\t%u\t%u%n",
2561                                         fi->fib_dev ? fi->fib_dev->name : "*",
2562                                         prefix,
2563                                         fi->fib_nh->nh_gw, flags, 0, 0,
2564                                         fi->fib_priority,
2565                                         mask,
2566                                         (fi->fib_advmss ?
2567                                          fi->fib_advmss + 40 : 0),
2568                                         fi->fib_window,
2569                                         fi->fib_rtt >> 3, &len);
2570                        else
2571                                seq_printf(seq,
2572                                         "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2573                                         "%d\t%08X\t%d\t%u\t%u%n",
2574                                         prefix, 0, flags, 0, 0, 0,
2575                                         mask, 0, 0, 0, &len);
2576
2577                        seq_printf(seq, "%*s\n", 127 - len, "");
2578                }
2579        }
2580
2581        return 0;
2582}
2583
2584static const struct seq_operations fib_route_seq_ops = {
2585        .start  = fib_route_seq_start,
2586        .next   = fib_route_seq_next,
2587        .stop   = fib_route_seq_stop,
2588        .show   = fib_route_seq_show,
2589};
2590
2591static int fib_route_seq_open(struct inode *inode, struct file *file)
2592{
2593        return seq_open_net(inode, file, &fib_route_seq_ops,
2594                            sizeof(struct fib_route_iter));
2595}
2596
2597static const struct file_operations fib_route_fops = {
2598        .owner  = THIS_MODULE,
2599        .open   = fib_route_seq_open,
2600        .read   = seq_read,
2601        .llseek = seq_lseek,
2602        .release = seq_release_net,
2603};
2604
2605int __net_init fib_proc_init(struct net *net)
2606{
2607        if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2608                goto out1;
2609
2610        if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2611                                  &fib_triestat_fops))
2612                goto out2;
2613
2614        if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2615                goto out3;
2616
2617        return 0;
2618
2619out3:
2620        proc_net_remove(net, "fib_triestat");
2621out2:
2622        proc_net_remove(net, "fib_trie");
2623out1:
2624        return -ENOMEM;
2625}
2626
2627void __net_exit fib_proc_exit(struct net *net)
2628{
2629        proc_net_remove(net, "fib_trie");
2630        proc_net_remove(net, "fib_triestat");
2631        proc_net_remove(net, "route");
2632}
2633
2634#endif /* CONFIG_PROC_FS */
2635
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.