linux/net/ipv4/fib_trie.c
<<
>>
Prefs
   1/*
   2 *   This program is free software; you can redistribute it and/or
   3 *   modify it under the terms of the GNU General Public License
   4 *   as published by the Free Software Foundation; either version
   5 *   2 of the License, or (at your option) any later version.
   6 *
   7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8 *     & Swedish University of Agricultural Sciences.
   9 *
  10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11 *     Agricultural Sciences.
  12 *
  13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14 *
  15 * This work is based on the LPC-trie which is originally described in:
  16 *
  17 * An experimental study of compression methods for dynamic tries
  18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  20 *
  21 *
  22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24 *
  25 *
  26 * Code from fib_hash has been reused which includes the following header:
  27 *
  28 *
  29 * INET         An implementation of the TCP/IP protocol suite for the LINUX
  30 *              operating system.  INET is implemented using the  BSD Socket
  31 *              interface as the means of communication with the user level.
  32 *
  33 *              IPv4 FIB: lookup engine and maintenance routines.
  34 *
  35 *
  36 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37 *
  38 *              This program is free software; you can redistribute it and/or
  39 *              modify it under the terms of the GNU General Public License
  40 *              as published by the Free Software Foundation; either version
  41 *              2 of the License, or (at your option) any later version.
  42 *
  43 * Substantial contributions to this work comes from:
  44 *
  45 *              David S. Miller, <davem@davemloft.net>
  46 *              Stephen Hemminger <shemminger@osdl.org>
  47 *              Paul E. McKenney <paulmck@us.ibm.com>
  48 *              Patrick McHardy <kaber@trash.net>
  49 */
  50
  51#define VERSION "0.409"
  52
  53#include <asm/uaccess.h>
  54#include <asm/system.h>
  55#include <linux/bitops.h>
  56#include <linux/types.h>
  57#include <linux/kernel.h>
  58#include <linux/mm.h>
  59#include <linux/string.h>
  60#include <linux/socket.h>
  61#include <linux/sockios.h>
  62#include <linux/errno.h>
  63#include <linux/in.h>
  64#include <linux/inet.h>
  65#include <linux/inetdevice.h>
  66#include <linux/netdevice.h>
  67#include <linux/if_arp.h>
  68#include <linux/proc_fs.h>
  69#include <linux/rcupdate.h>
  70#include <linux/skbuff.h>
  71#include <linux/netlink.h>
  72#include <linux/init.h>
  73#include <linux/list.h>
  74#include <linux/slab.h>
  75#include <linux/prefetch.h>
  76#include <linux/export.h>
  77#include <net/net_namespace.h>
  78#include <net/ip.h>
  79#include <net/protocol.h>
  80#include <net/route.h>
  81#include <net/tcp.h>
  82#include <net/sock.h>
  83#include <net/ip_fib.h>
  84#include "fib_lookup.h"
  85
  86#define MAX_STAT_DEPTH 32
  87
  88#define KEYLENGTH (8*sizeof(t_key))
  89
  90typedef unsigned int t_key;
  91
  92#define T_TNODE 0
  93#define T_LEAF  1
  94#define NODE_TYPE_MASK  0x1UL
  95#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  96
  97#define IS_TNODE(n) (!(n->parent & T_LEAF))
  98#define IS_LEAF(n) (n->parent & T_LEAF)
  99
 100struct rt_trie_node {
 101        unsigned long parent;
 102        t_key key;
 103};
 104
 105struct leaf {
 106        unsigned long parent;
 107        t_key key;
 108        struct hlist_head list;
 109        struct rcu_head rcu;
 110};
 111
 112struct leaf_info {
 113        struct hlist_node hlist;
 114        int plen;
 115        u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
 116        struct list_head falh;
 117        struct rcu_head rcu;
 118};
 119
 120struct tnode {
 121        unsigned long parent;
 122        t_key key;
 123        unsigned char pos;              /* 2log(KEYLENGTH) bits needed */
 124        unsigned char bits;             /* 2log(KEYLENGTH) bits needed */
 125        unsigned int full_children;     /* KEYLENGTH bits needed */
 126        unsigned int empty_children;    /* KEYLENGTH bits needed */
 127        union {
 128                struct rcu_head rcu;
 129                struct work_struct work;
 130                struct tnode *tnode_free;
 131        };
 132        struct rt_trie_node __rcu *child[0];
 133};
 134
 135#ifdef CONFIG_IP_FIB_TRIE_STATS
 136struct trie_use_stats {
 137        unsigned int gets;
 138        unsigned int backtrack;
 139        unsigned int semantic_match_passed;
 140        unsigned int semantic_match_miss;
 141        unsigned int null_node_hit;
 142        unsigned int resize_node_skipped;
 143};
 144#endif
 145
 146struct trie_stat {
 147        unsigned int totdepth;
 148        unsigned int maxdepth;
 149        unsigned int tnodes;
 150        unsigned int leaves;
 151        unsigned int nullpointers;
 152        unsigned int prefixes;
 153        unsigned int nodesizes[MAX_STAT_DEPTH];
 154};
 155
 156struct trie {
 157        struct rt_trie_node __rcu *trie;
 158#ifdef CONFIG_IP_FIB_TRIE_STATS
 159        struct trie_use_stats stats;
 160#endif
 161};
 162
 163static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
 164static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 165                                  int wasfull);
 166static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
 167static struct tnode *inflate(struct trie *t, struct tnode *tn);
 168static struct tnode *halve(struct trie *t, struct tnode *tn);
 169/* tnodes to free after resize(); protected by RTNL */
 170static struct tnode *tnode_free_head;
 171static size_t tnode_free_size;
 172
 173/*
 174 * synchronize_rcu after call_rcu for that many pages; it should be especially
 175 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 176 * obtained experimentally, aiming to avoid visible slowdown.
 177 */
 178static const int sync_pages = 128;
 179
 180static struct kmem_cache *fn_alias_kmem __read_mostly;
 181static struct kmem_cache *trie_leaf_kmem __read_mostly;
 182
 183/*
 184 * caller must hold RTNL
 185 */
 186static inline struct tnode *node_parent(const struct rt_trie_node *node)
 187{
 188        unsigned long parent;
 189
 190        parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
 191
 192        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 193}
 194
 195/*
 196 * caller must hold RCU read lock or RTNL
 197 */
 198static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
 199{
 200        unsigned long parent;
 201
 202        parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
 203                                                           lockdep_rtnl_is_held());
 204
 205        return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 206}
 207
 208/* Same as rcu_assign_pointer
 209 * but that macro() assumes that value is a pointer.
 210 */
 211static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
 212{
 213        smp_wmb();
 214        node->parent = (unsigned long)ptr | NODE_TYPE(node);
 215}
 216
 217/*
 218 * caller must hold RTNL
 219 */
 220static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
 221{
 222        BUG_ON(i >= 1U << tn->bits);
 223
 224        return rtnl_dereference(tn->child[i]);
 225}
 226
 227/*
 228 * caller must hold RCU read lock or RTNL
 229 */
 230static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
 231{
 232        BUG_ON(i >= 1U << tn->bits);
 233
 234        return rcu_dereference_rtnl(tn->child[i]);
 235}
 236
 237static inline int tnode_child_length(const struct tnode *tn)
 238{
 239        return 1 << tn->bits;
 240}
 241
 242static inline t_key mask_pfx(t_key k, unsigned int l)
 243{
 244        return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 245}
 246
 247static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
 248{
 249        if (offset < KEYLENGTH)
 250                return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 251        else
 252                return 0;
 253}
 254
 255static inline int tkey_equals(t_key a, t_key b)
 256{
 257        return a == b;
 258}
 259
 260static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 261{
 262        if (bits == 0 || offset >= KEYLENGTH)
 263                return 1;
 264        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 265        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 266}
 267
 268static inline int tkey_mismatch(t_key a, int offset, t_key b)
 269{
 270        t_key diff = a ^ b;
 271        int i = offset;
 272
 273        if (!diff)
 274                return 0;
 275        while ((diff << i) >> (KEYLENGTH-1) == 0)
 276                i++;
 277        return i;
 278}
 279
 280/*
 281  To understand this stuff, an understanding of keys and all their bits is
 282  necessary. Every node in the trie has a key associated with it, but not
 283  all of the bits in that key are significant.
 284
 285  Consider a node 'n' and its parent 'tp'.
 286
 287  If n is a leaf, every bit in its key is significant. Its presence is
 288  necessitated by path compression, since during a tree traversal (when
 289  searching for a leaf - unless we are doing an insertion) we will completely
 290  ignore all skipped bits we encounter. Thus we need to verify, at the end of
 291  a potentially successful search, that we have indeed been walking the
 292  correct key path.
 293
 294  Note that we can never "miss" the correct key in the tree if present by
 295  following the wrong path. Path compression ensures that segments of the key
 296  that are the same for all keys with a given prefix are skipped, but the
 297  skipped part *is* identical for each node in the subtrie below the skipped
 298  bit! trie_insert() in this implementation takes care of that - note the
 299  call to tkey_sub_equals() in trie_insert().
 300
 301  if n is an internal node - a 'tnode' here, the various parts of its key
 302  have many different meanings.
 303
 304  Example:
 305  _________________________________________________________________
 306  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 307  -----------------------------------------------------------------
 308    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 309
 310  _________________________________________________________________
 311  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 312  -----------------------------------------------------------------
 313   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 314
 315  tp->pos = 7
 316  tp->bits = 3
 317  n->pos = 15
 318  n->bits = 4
 319
 320  First, let's just ignore the bits that come before the parent tp, that is
 321  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 322  not use them for anything.
 323
 324  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 325  index into the parent's child array. That is, they will be used to find
 326  'n' among tp's children.
 327
 328  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 329  for the node n.
 330
 331  All the bits we have seen so far are significant to the node n. The rest
 332  of the bits are really not needed or indeed known in n->key.
 333
 334  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 335  n's child array, and will of course be different for each child.
 336
 337
 338  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 339  at this point.
 340
 341*/
 342
 343static inline void check_tnode(const struct tnode *tn)
 344{
 345        WARN_ON(tn && tn->pos+tn->bits > 32);
 346}
 347
 348static const int halve_threshold = 25;
 349static const int inflate_threshold = 50;
 350static const int halve_threshold_root = 15;
 351static const int inflate_threshold_root = 30;
 352
 353static void __alias_free_mem(struct rcu_head *head)
 354{
 355        struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 356        kmem_cache_free(fn_alias_kmem, fa);
 357}
 358
 359static inline void alias_free_mem_rcu(struct fib_alias *fa)
 360{
 361        call_rcu(&fa->rcu, __alias_free_mem);
 362}
 363
 364static void __leaf_free_rcu(struct rcu_head *head)
 365{
 366        struct leaf *l = container_of(head, struct leaf, rcu);
 367        kmem_cache_free(trie_leaf_kmem, l);
 368}
 369
 370static inline void free_leaf(struct leaf *l)
 371{
 372        call_rcu_bh(&l->rcu, __leaf_free_rcu);
 373}
 374
 375static inline void free_leaf_info(struct leaf_info *leaf)
 376{
 377        kfree_rcu(leaf, rcu);
 378}
 379
 380static struct tnode *tnode_alloc(size_t size)
 381{
 382        if (size <= PAGE_SIZE)
 383                return kzalloc(size, GFP_KERNEL);
 384        else
 385                return vzalloc(size);
 386}
 387
 388static void __tnode_vfree(struct work_struct *arg)
 389{
 390        struct tnode *tn = container_of(arg, struct tnode, work);
 391        vfree(tn);
 392}
 393
 394static void __tnode_free_rcu(struct rcu_head *head)
 395{
 396        struct tnode *tn = container_of(head, struct tnode, rcu);
 397        size_t size = sizeof(struct tnode) +
 398                      (sizeof(struct rt_trie_node *) << tn->bits);
 399
 400        if (size <= PAGE_SIZE)
 401                kfree(tn);
 402        else {
 403                INIT_WORK(&tn->work, __tnode_vfree);
 404                schedule_work(&tn->work);
 405        }
 406}
 407
 408static inline void tnode_free(struct tnode *tn)
 409{
 410        if (IS_LEAF(tn))
 411                free_leaf((struct leaf *) tn);
 412        else
 413                call_rcu(&tn->rcu, __tnode_free_rcu);
 414}
 415
 416static void tnode_free_safe(struct tnode *tn)
 417{
 418        BUG_ON(IS_LEAF(tn));
 419        tn->tnode_free = tnode_free_head;
 420        tnode_free_head = tn;
 421        tnode_free_size += sizeof(struct tnode) +
 422                           (sizeof(struct rt_trie_node *) << tn->bits);
 423}
 424
 425static void tnode_free_flush(void)
 426{
 427        struct tnode *tn;
 428
 429        while ((tn = tnode_free_head)) {
 430                tnode_free_head = tn->tnode_free;
 431                tn->tnode_free = NULL;
 432                tnode_free(tn);
 433        }
 434
 435        if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 436                tnode_free_size = 0;
 437                synchronize_rcu();
 438        }
 439}
 440
 441static struct leaf *leaf_new(void)
 442{
 443        struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 444        if (l) {
 445                l->parent = T_LEAF;
 446                INIT_HLIST_HEAD(&l->list);
 447        }
 448        return l;
 449}
 450
 451static struct leaf_info *leaf_info_new(int plen)
 452{
 453        struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 454        if (li) {
 455                li->plen = plen;
 456                li->mask_plen = ntohl(inet_make_mask(plen));
 457                INIT_LIST_HEAD(&li->falh);
 458        }
 459        return li;
 460}
 461
 462static struct tnode *tnode_new(t_key key, int pos, int bits)
 463{
 464        size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
 465        struct tnode *tn = tnode_alloc(sz);
 466
 467        if (tn) {
 468                tn->parent = T_TNODE;
 469                tn->pos = pos;
 470                tn->bits = bits;
 471                tn->key = key;
 472                tn->full_children = 0;
 473                tn->empty_children = 1<<bits;
 474        }
 475
 476        pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
 477                 sizeof(struct rt_trie_node) << bits);
 478        return tn;
 479}
 480
 481/*
 482 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 483 * and no bits are skipped. See discussion in dyntree paper p. 6
 484 */
 485
 486static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
 487{
 488        if (n == NULL || IS_LEAF(n))
 489                return 0;
 490
 491        return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 492}
 493
 494static inline void put_child(struct trie *t, struct tnode *tn, int i,
 495                             struct rt_trie_node *n)
 496{
 497        tnode_put_child_reorg(tn, i, n, -1);
 498}
 499
 500 /*
 501  * Add a child at position i overwriting the old value.
 502  * Update the value of full_children and empty_children.
 503  */
 504
 505static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
 506                                  int wasfull)
 507{
 508        struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
 509        int isfull;
 510
 511        BUG_ON(i >= 1<<tn->bits);
 512
 513        /* update emptyChildren */
 514        if (n == NULL && chi != NULL)
 515                tn->empty_children++;
 516        else if (n != NULL && chi == NULL)
 517                tn->empty_children--;
 518
 519        /* update fullChildren */
 520        if (wasfull == -1)
 521                wasfull = tnode_full(tn, chi);
 522
 523        isfull = tnode_full(tn, n);
 524        if (wasfull && !isfull)
 525                tn->full_children--;
 526        else if (!wasfull && isfull)
 527                tn->full_children++;
 528
 529        if (n)
 530                node_set_parent(n, tn);
 531
 532        rcu_assign_pointer(tn->child[i], n);
 533}
 534
 535#define MAX_WORK 10
 536static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
 537{
 538        int i;
 539        struct tnode *old_tn;
 540        int inflate_threshold_use;
 541        int halve_threshold_use;
 542        int max_work;
 543
 544        if (!tn)
 545                return NULL;
 546
 547        pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 548                 tn, inflate_threshold, halve_threshold);
 549
 550        /* No children */
 551        if (tn->empty_children == tnode_child_length(tn)) {
 552                tnode_free_safe(tn);
 553                return NULL;
 554        }
 555        /* One child */
 556        if (tn->empty_children == tnode_child_length(tn) - 1)
 557                goto one_child;
 558        /*
 559         * Double as long as the resulting node has a number of
 560         * nonempty nodes that are above the threshold.
 561         */
 562
 563        /*
 564         * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 565         * the Helsinki University of Technology and Matti Tikkanen of Nokia
 566         * Telecommunications, page 6:
 567         * "A node is doubled if the ratio of non-empty children to all
 568         * children in the *doubled* node is at least 'high'."
 569         *
 570         * 'high' in this instance is the variable 'inflate_threshold'. It
 571         * is expressed as a percentage, so we multiply it with
 572         * tnode_child_length() and instead of multiplying by 2 (since the
 573         * child array will be doubled by inflate()) and multiplying
 574         * the left-hand side by 100 (to handle the percentage thing) we
 575         * multiply the left-hand side by 50.
 576         *
 577         * The left-hand side may look a bit weird: tnode_child_length(tn)
 578         * - tn->empty_children is of course the number of non-null children
 579         * in the current node. tn->full_children is the number of "full"
 580         * children, that is non-null tnodes with a skip value of 0.
 581         * All of those will be doubled in the resulting inflated tnode, so
 582         * we just count them one extra time here.
 583         *
 584         * A clearer way to write this would be:
 585         *
 586         * to_be_doubled = tn->full_children;
 587         * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 588         *     tn->full_children;
 589         *
 590         * new_child_length = tnode_child_length(tn) * 2;
 591         *
 592         * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 593         *      new_child_length;
 594         * if (new_fill_factor >= inflate_threshold)
 595         *
 596         * ...and so on, tho it would mess up the while () loop.
 597         *
 598         * anyway,
 599         * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 600         *      inflate_threshold
 601         *
 602         * avoid a division:
 603         * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 604         *      inflate_threshold * new_child_length
 605         *
 606         * expand not_to_be_doubled and to_be_doubled, and shorten:
 607         * 100 * (tnode_child_length(tn) - tn->empty_children +
 608         *    tn->full_children) >= inflate_threshold * new_child_length
 609         *
 610         * expand new_child_length:
 611         * 100 * (tnode_child_length(tn) - tn->empty_children +
 612         *    tn->full_children) >=
 613         *      inflate_threshold * tnode_child_length(tn) * 2
 614         *
 615         * shorten again:
 616         * 50 * (tn->full_children + tnode_child_length(tn) -
 617         *    tn->empty_children) >= inflate_threshold *
 618         *    tnode_child_length(tn)
 619         *
 620         */
 621
 622        check_tnode(tn);
 623
 624        /* Keep root node larger  */
 625
 626        if (!node_parent((struct rt_trie_node *)tn)) {
 627                inflate_threshold_use = inflate_threshold_root;
 628                halve_threshold_use = halve_threshold_root;
 629        } else {
 630                inflate_threshold_use = inflate_threshold;
 631                halve_threshold_use = halve_threshold;
 632        }
 633
 634        max_work = MAX_WORK;
 635        while ((tn->full_children > 0 &&  max_work-- &&
 636                50 * (tn->full_children + tnode_child_length(tn)
 637                      - tn->empty_children)
 638                >= inflate_threshold_use * tnode_child_length(tn))) {
 639
 640                old_tn = tn;
 641                tn = inflate(t, tn);
 642
 643                if (IS_ERR(tn)) {
 644                        tn = old_tn;
 645#ifdef CONFIG_IP_FIB_TRIE_STATS
 646                        t->stats.resize_node_skipped++;
 647#endif
 648                        break;
 649                }
 650        }
 651
 652        check_tnode(tn);
 653
 654        /* Return if at least one inflate is run */
 655        if (max_work != MAX_WORK)
 656                return (struct rt_trie_node *) tn;
 657
 658        /*
 659         * Halve as long as the number of empty children in this
 660         * node is above threshold.
 661         */
 662
 663        max_work = MAX_WORK;
 664        while (tn->bits > 1 &&  max_work-- &&
 665               100 * (tnode_child_length(tn) - tn->empty_children) <
 666               halve_threshold_use * tnode_child_length(tn)) {
 667
 668                old_tn = tn;
 669                tn = halve(t, tn);
 670                if (IS_ERR(tn)) {
 671                        tn = old_tn;
 672#ifdef CONFIG_IP_FIB_TRIE_STATS
 673                        t->stats.resize_node_skipped++;
 674#endif
 675                        break;
 676                }
 677        }
 678
 679
 680        /* Only one child remains */
 681        if (tn->empty_children == tnode_child_length(tn) - 1) {
 682one_child:
 683                for (i = 0; i < tnode_child_length(tn); i++) {
 684                        struct rt_trie_node *n;
 685
 686                        n = rtnl_dereference(tn->child[i]);
 687                        if (!n)
 688                                continue;
 689
 690                        /* compress one level */
 691
 692                        node_set_parent(n, NULL);
 693                        tnode_free_safe(tn);
 694                        return n;
 695                }
 696        }
 697        return (struct rt_trie_node *) tn;
 698}
 699
 700
 701static void tnode_clean_free(struct tnode *tn)
 702{
 703        int i;
 704        struct tnode *tofree;
 705
 706        for (i = 0; i < tnode_child_length(tn); i++) {
 707                tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
 708                if (tofree)
 709                        tnode_free(tofree);
 710        }
 711        tnode_free(tn);
 712}
 713
 714static struct tnode *inflate(struct trie *t, struct tnode *tn)
 715{
 716        struct tnode *oldtnode = tn;
 717        int olen = tnode_child_length(tn);
 718        int i;
 719
 720        pr_debug("In inflate\n");
 721
 722        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 723
 724        if (!tn)
 725                return ERR_PTR(-ENOMEM);
 726
 727        /*
 728         * Preallocate and store tnodes before the actual work so we
 729         * don't get into an inconsistent state if memory allocation
 730         * fails. In case of failure we return the oldnode and  inflate
 731         * of tnode is ignored.
 732         */
 733
 734        for (i = 0; i < olen; i++) {
 735                struct tnode *inode;
 736
 737                inode = (struct tnode *) tnode_get_child(oldtnode, i);
 738                if (inode &&
 739                    IS_TNODE(inode) &&
 740                    inode->pos == oldtnode->pos + oldtnode->bits &&
 741                    inode->bits > 1) {
 742                        struct tnode *left, *right;
 743                        t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 744
 745                        left = tnode_new(inode->key&(~m), inode->pos + 1,
 746                                         inode->bits - 1);
 747                        if (!left)
 748                                goto nomem;
 749
 750                        right = tnode_new(inode->key|m, inode->pos + 1,
 751                                          inode->bits - 1);
 752
 753                        if (!right) {
 754                                tnode_free(left);
 755                                goto nomem;
 756                        }
 757
 758                        put_child(t, tn, 2*i, (struct rt_trie_node *) left);
 759                        put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
 760                }
 761        }
 762
 763        for (i = 0; i < olen; i++) {
 764                struct tnode *inode;
 765                struct rt_trie_node *node = tnode_get_child(oldtnode, i);
 766                struct tnode *left, *right;
 767                int size, j;
 768
 769                /* An empty child */
 770                if (node == NULL)
 771                        continue;
 772
 773                /* A leaf or an internal node with skipped bits */
 774
 775                if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 776                   tn->pos + tn->bits - 1) {
 777                        if (tkey_extract_bits(node->key,
 778                                              oldtnode->pos + oldtnode->bits,
 779                                              1) == 0)
 780                                put_child(t, tn, 2*i, node);
 781                        else
 782                                put_child(t, tn, 2*i+1, node);
 783                        continue;
 784                }
 785
 786                /* An internal node with two children */
 787                inode = (struct tnode *) node;
 788
 789                if (inode->bits == 1) {
 790                        put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
 791                        put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
 792
 793                        tnode_free_safe(inode);
 794                        continue;
 795                }
 796
 797                /* An internal node with more than two children */
 798
 799                /* We will replace this node 'inode' with two new
 800                 * ones, 'left' and 'right', each with half of the
 801                 * original children. The two new nodes will have
 802                 * a position one bit further down the key and this
 803                 * means that the "significant" part of their keys
 804                 * (see the discussion near the top of this file)
 805                 * will differ by one bit, which will be "0" in
 806                 * left's key and "1" in right's key. Since we are
 807                 * moving the key position by one step, the bit that
 808                 * we are moving away from - the bit at position
 809                 * (inode->pos) - is the one that will differ between
 810                 * left and right. So... we synthesize that bit in the
 811                 * two  new keys.
 812                 * The mask 'm' below will be a single "one" bit at
 813                 * the position (inode->pos)
 814                 */
 815
 816                /* Use the old key, but set the new significant
 817                 *   bit to zero.
 818                 */
 819
 820                left = (struct tnode *) tnode_get_child(tn, 2*i);
 821                put_child(t, tn, 2*i, NULL);
 822
 823                BUG_ON(!left);
 824
 825                right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 826                put_child(t, tn, 2*i+1, NULL);
 827
 828                BUG_ON(!right);
 829
 830                size = tnode_child_length(left);
 831                for (j = 0; j < size; j++) {
 832                        put_child(t, left, j, rtnl_dereference(inode->child[j]));
 833                        put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
 834                }
 835                put_child(t, tn, 2*i, resize(t, left));
 836                put_child(t, tn, 2*i+1, resize(t, right));
 837
 838                tnode_free_safe(inode);
 839        }
 840        tnode_free_safe(oldtnode);
 841        return tn;
 842nomem:
 843        tnode_clean_free(tn);
 844        return ERR_PTR(-ENOMEM);
 845}
 846
 847static struct tnode *halve(struct trie *t, struct tnode *tn)
 848{
 849        struct tnode *oldtnode = tn;
 850        struct rt_trie_node *left, *right;
 851        int i;
 852        int olen = tnode_child_length(tn);
 853
 854        pr_debug("In halve\n");
 855
 856        tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 857
 858        if (!tn)
 859                return ERR_PTR(-ENOMEM);
 860
 861        /*
 862         * Preallocate and store tnodes before the actual work so we
 863         * don't get into an inconsistent state if memory allocation
 864         * fails. In case of failure we return the oldnode and halve
 865         * of tnode is ignored.
 866         */
 867
 868        for (i = 0; i < olen; i += 2) {
 869                left = tnode_get_child(oldtnode, i);
 870                right = tnode_get_child(oldtnode, i+1);
 871
 872                /* Two nonempty children */
 873                if (left && right) {
 874                        struct tnode *newn;
 875
 876                        newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 877
 878                        if (!newn)
 879                                goto nomem;
 880
 881                        put_child(t, tn, i/2, (struct rt_trie_node *)newn);
 882                }
 883
 884        }
 885
 886        for (i = 0; i < olen; i += 2) {
 887                struct tnode *newBinNode;
 888
 889                left = tnode_get_child(oldtnode, i);
 890                right = tnode_get_child(oldtnode, i+1);
 891
 892                /* At least one of the children is empty */
 893                if (left == NULL) {
 894                        if (right == NULL)    /* Both are empty */
 895                                continue;
 896                        put_child(t, tn, i/2, right);
 897                        continue;
 898                }
 899
 900                if (right == NULL) {
 901                        put_child(t, tn, i/2, left);
 902                        continue;
 903                }
 904
 905                /* Two nonempty children */
 906                newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 907                put_child(t, tn, i/2, NULL);
 908                put_child(t, newBinNode, 0, left);
 909                put_child(t, newBinNode, 1, right);
 910                put_child(t, tn, i/2, resize(t, newBinNode));
 911        }
 912        tnode_free_safe(oldtnode);
 913        return tn;
 914nomem:
 915        tnode_clean_free(tn);
 916        return ERR_PTR(-ENOMEM);
 917}
 918
 919/* readside must use rcu_read_lock currently dump routines
 920 via get_fa_head and dump */
 921
 922static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 923{
 924        struct hlist_head *head = &l->list;
 925        struct hlist_node *node;
 926        struct leaf_info *li;
 927
 928        hlist_for_each_entry_rcu(li, node, head, hlist)
 929                if (li->plen == plen)
 930                        return li;
 931
 932        return NULL;
 933}
 934
 935static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 936{
 937        struct leaf_info *li = find_leaf_info(l, plen);
 938
 939        if (!li)
 940                return NULL;
 941
 942        return &li->falh;
 943}
 944
 945static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 946{
 947        struct leaf_info *li = NULL, *last = NULL;
 948        struct hlist_node *node;
 949
 950        if (hlist_empty(head)) {
 951                hlist_add_head_rcu(&new->hlist, head);
 952        } else {
 953                hlist_for_each_entry(li, node, head, hlist) {
 954                        if (new->plen > li->plen)
 955                                break;
 956
 957                        last = li;
 958                }
 959                if (last)
 960                        hlist_add_after_rcu(&last->hlist, &new->hlist);
 961                else
 962                        hlist_add_before_rcu(&new->hlist, &li->hlist);
 963        }
 964}
 965
 966/* rcu_read_lock needs to be hold by caller from readside */
 967
 968static struct leaf *
 969fib_find_node(struct trie *t, u32 key)
 970{
 971        int pos;
 972        struct tnode *tn;
 973        struct rt_trie_node *n;
 974
 975        pos = 0;
 976        n = rcu_dereference_rtnl(t->trie);
 977
 978        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 979                tn = (struct tnode *) n;
 980
 981                check_tnode(tn);
 982
 983                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 984                        pos = tn->pos + tn->bits;
 985                        n = tnode_get_child_rcu(tn,
 986                                                tkey_extract_bits(key,
 987                                                                  tn->pos,
 988                                                                  tn->bits));
 989                } else
 990                        break;
 991        }
 992        /* Case we have found a leaf. Compare prefixes */
 993
 994        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 995                return (struct leaf *)n;
 996
 997        return NULL;
 998}
 999
1000static void trie_rebalance(struct trie *t, struct tnode *tn)
1001{
1002        int wasfull;
1003        t_key cindex, key;
1004        struct tnode *tp;
1005
1006        key = tn->key;
1007
1008        while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1009                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1010                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1011                tn = (struct tnode *) resize(t, (struct tnode *)tn);
1012
1013                tnode_put_child_reorg((struct tnode *)tp, cindex,
1014                                      (struct rt_trie_node *)tn, wasfull);
1015
1016                tp = node_parent((struct rt_trie_node *) tn);
1017                if (!tp)
1018                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1019
1020                tnode_free_flush();
1021                if (!tp)
1022                        break;
1023                tn = tp;
1024        }
1025
1026        /* Handle last (top) tnode */
1027        if (IS_TNODE(tn))
1028                tn = (struct tnode *)resize(t, (struct tnode *)tn);
1029
1030        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1031        tnode_free_flush();
1032}
1033
1034/* only used from updater-side */
1035
1036static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1037{
1038        int pos, newpos;
1039        struct tnode *tp = NULL, *tn = NULL;
1040        struct rt_trie_node *n;
1041        struct leaf *l;
1042        int missbit;
1043        struct list_head *fa_head = NULL;
1044        struct leaf_info *li;
1045        t_key cindex;
1046
1047        pos = 0;
1048        n = rtnl_dereference(t->trie);
1049
1050        /* If we point to NULL, stop. Either the tree is empty and we should
1051         * just put a new leaf in if, or we have reached an empty child slot,
1052         * and we should just put our new leaf in that.
1053         * If we point to a T_TNODE, check if it matches our key. Note that
1054         * a T_TNODE might be skipping any number of bits - its 'pos' need
1055         * not be the parent's 'pos'+'bits'!
1056         *
1057         * If it does match the current key, get pos/bits from it, extract
1058         * the index from our key, push the T_TNODE and walk the tree.
1059         *
1060         * If it doesn't, we have to replace it with a new T_TNODE.
1061         *
1062         * If we point to a T_LEAF, it might or might not have the same key
1063         * as we do. If it does, just change the value, update the T_LEAF's
1064         * value, and return it.
1065         * If it doesn't, we need to replace it with a T_TNODE.
1066         */
1067
1068        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1069                tn = (struct tnode *) n;
1070
1071                check_tnode(tn);
1072
1073                if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1074                        tp = tn;
1075                        pos = tn->pos + tn->bits;
1076                        n = tnode_get_child(tn,
1077                                            tkey_extract_bits(key,
1078                                                              tn->pos,
1079                                                              tn->bits));
1080
1081                        BUG_ON(n && node_parent(n) != tn);
1082                } else
1083                        break;
1084        }
1085
1086        /*
1087         * n  ----> NULL, LEAF or TNODE
1088         *
1089         * tp is n's (parent) ----> NULL or TNODE
1090         */
1091
1092        BUG_ON(tp && IS_LEAF(tp));
1093
1094        /* Case 1: n is a leaf. Compare prefixes */
1095
1096        if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1097                l = (struct leaf *) n;
1098                li = leaf_info_new(plen);
1099
1100                if (!li)
1101                        return NULL;
1102
1103                fa_head = &li->falh;
1104                insert_leaf_info(&l->list, li);
1105                goto done;
1106        }
1107        l = leaf_new();
1108
1109        if (!l)
1110                return NULL;
1111
1112        l->key = key;
1113        li = leaf_info_new(plen);
1114
1115        if (!li) {
1116                free_leaf(l);
1117                return NULL;
1118        }
1119
1120        fa_head = &li->falh;
1121        insert_leaf_info(&l->list, li);
1122
1123        if (t->trie && n == NULL) {
1124                /* Case 2: n is NULL, and will just insert a new leaf */
1125
1126                node_set_parent((struct rt_trie_node *)l, tp);
1127
1128                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1129                put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
1130        } else {
1131                /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1132                /*
1133                 *  Add a new tnode here
1134                 *  first tnode need some special handling
1135                 */
1136
1137                if (tp)
1138                        pos = tp->pos+tp->bits;
1139                else
1140                        pos = 0;
1141
1142                if (n) {
1143                        newpos = tkey_mismatch(key, pos, n->key);
1144                        tn = tnode_new(n->key, newpos, 1);
1145                } else {
1146                        newpos = 0;
1147                        tn = tnode_new(key, newpos, 1); /* First tnode */
1148                }
1149
1150                if (!tn) {
1151                        free_leaf_info(li);
1152                        free_leaf(l);
1153                        return NULL;
1154                }
1155
1156                node_set_parent((struct rt_trie_node *)tn, tp);
1157
1158                missbit = tkey_extract_bits(key, newpos, 1);
1159                put_child(t, tn, missbit, (struct rt_trie_node *)l);
1160                put_child(t, tn, 1-missbit, n);
1161
1162                if (tp) {
1163                        cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1164                        put_child(t, (struct tnode *)tp, cindex,
1165                                  (struct rt_trie_node *)tn);
1166                } else {
1167                        rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1168                        tp = tn;
1169                }
1170        }
1171
1172        if (tp && tp->pos + tp->bits > 32)
1173                pr_warning("fib_trie"
1174                           " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1175                           tp, tp->pos, tp->bits, key, plen);
1176
1177        /* Rebalance the trie */
1178
1179        trie_rebalance(t, tp);
1180done:
1181        return fa_head;
1182}
1183
1184/*
1185 * Caller must hold RTNL.
1186 */
1187int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1188{
1189        struct trie *t = (struct trie *) tb->tb_data;
1190        struct fib_alias *fa, *new_fa;
1191        struct list_head *fa_head = NULL;
1192        struct fib_info *fi;
1193        int plen = cfg->fc_dst_len;
1194        u8 tos = cfg->fc_tos;
1195        u32 key, mask;
1196        int err;
1197        struct leaf *l;
1198
1199        if (plen > 32)
1200                return -EINVAL;
1201
1202        key = ntohl(cfg->fc_dst);
1203
1204        pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1205
1206        mask = ntohl(inet_make_mask(plen));
1207
1208        if (key & ~mask)
1209                return -EINVAL;
1210
1211        key = key & mask;
1212
1213        fi = fib_create_info(cfg);
1214        if (IS_ERR(fi)) {
1215                err = PTR_ERR(fi);
1216                goto err;
1217        }
1218
1219        l = fib_find_node(t, key);
1220        fa = NULL;
1221
1222        if (l) {
1223                fa_head = get_fa_head(l, plen);
1224                fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1225        }
1226
1227        /* Now fa, if non-NULL, points to the first fib alias
1228         * with the same keys [prefix,tos,priority], if such key already
1229         * exists or to the node before which we will insert new one.
1230         *
1231         * If fa is NULL, we will need to allocate a new one and
1232         * insert to the head of f.
1233         *
1234         * If f is NULL, no fib node matched the destination key
1235         * and we need to allocate a new one of those as well.
1236         */
1237
1238        if (fa && fa->fa_tos == tos &&
1239            fa->fa_info->fib_priority == fi->fib_priority) {
1240                struct fib_alias *fa_first, *fa_match;
1241
1242                err = -EEXIST;
1243                if (cfg->fc_nlflags & NLM_F_EXCL)
1244                        goto out;
1245
1246                /* We have 2 goals:
1247                 * 1. Find exact match for type, scope, fib_info to avoid
1248                 * duplicate routes
1249                 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1250                 */
1251                fa_match = NULL;
1252                fa_first = fa;
1253                fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1254                list_for_each_entry_continue(fa, fa_head, fa_list) {
1255                        if (fa->fa_tos != tos)
1256                                break;
1257                        if (fa->fa_info->fib_priority != fi->fib_priority)
1258                                break;
1259                        if (fa->fa_type == cfg->fc_type &&
1260                            fa->fa_info == fi) {
1261                                fa_match = fa;
1262                                break;
1263                        }
1264                }
1265
1266                if (cfg->fc_nlflags & NLM_F_REPLACE) {
1267                        struct fib_info *fi_drop;
1268                        u8 state;
1269
1270                        fa = fa_first;
1271                        if (fa_match) {
1272                                if (fa == fa_match)
1273                                        err = 0;
1274                                goto out;
1275                        }
1276                        err = -ENOBUFS;
1277                        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1278                        if (new_fa == NULL)
1279                                goto out;
1280
1281                        fi_drop = fa->fa_info;
1282                        new_fa->fa_tos = fa->fa_tos;
1283                        new_fa->fa_info = fi;
1284                        new_fa->fa_type = cfg->fc_type;
1285                        state = fa->fa_state;
1286                        new_fa->fa_state = state & ~FA_S_ACCESSED;
1287
1288                        list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1289                        alias_free_mem_rcu(fa);
1290
1291                        fib_release_info(fi_drop);
1292                        if (state & FA_S_ACCESSED)
1293                                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1294                        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1295                                tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1296
1297                        goto succeeded;
1298                }
1299                /* Error if we find a perfect match which
1300                 * uses the same scope, type, and nexthop
1301                 * information.
1302                 */
1303                if (fa_match)
1304                        goto out;
1305
1306                if (!(cfg->fc_nlflags & NLM_F_APPEND))
1307                        fa = fa_first;
1308        }
1309        err = -ENOENT;
1310        if (!(cfg->fc_nlflags & NLM_F_CREATE))
1311                goto out;
1312
1313        err = -ENOBUFS;
1314        new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1315        if (new_fa == NULL)
1316                goto out;
1317
1318        new_fa->fa_info = fi;
1319        new_fa->fa_tos = tos;
1320        new_fa->fa_type = cfg->fc_type;
1321        new_fa->fa_state = 0;
1322        /*
1323         * Insert new entry to the list.
1324         */
1325
1326        if (!fa_head) {
1327                fa_head = fib_insert_node(t, key, plen);
1328                if (unlikely(!fa_head)) {
1329                        err = -ENOMEM;
1330                        goto out_free_new_fa;
1331                }
1332        }
1333
1334        if (!plen)
1335                tb->tb_num_default++;
1336
1337        list_add_tail_rcu(&new_fa->fa_list,
1338                          (fa ? &fa->fa_list : fa_head));
1339
1340        rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1341        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1342                  &cfg->fc_nlinfo, 0);
1343succeeded:
1344        return 0;
1345
1346out_free_new_fa:
1347        kmem_cache_free(fn_alias_kmem, new_fa);
1348out:
1349        fib_release_info(fi);
1350err:
1351        return err;
1352}
1353
1354/* should be called with rcu_read_lock */
1355static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1356                      t_key key,  const struct flowi4 *flp,
1357                      struct fib_result *res, int fib_flags)
1358{
1359        struct leaf_info *li;
1360        struct hlist_head *hhead = &l->list;
1361        struct hlist_node *node;
1362
1363        hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1364                struct fib_alias *fa;
1365
1366                if (l->key != (key & li->mask_plen))
1367                        continue;
1368
1369                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1370                        struct fib_info *fi = fa->fa_info;
1371                        int nhsel, err;
1372
1373                        if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1374                                continue;
1375                        if (fa->fa_info->fib_scope < flp->flowi4_scope)
1376                                continue;
1377                        fib_alias_accessed(fa);
1378                        err = fib_props[fa->fa_type].error;
1379                        if (err) {
1380#ifdef CONFIG_IP_FIB_TRIE_STATS
1381                                t->stats.semantic_match_passed++;
1382#endif
1383                                return err;
1384                        }
1385                        if (fi->fib_flags & RTNH_F_DEAD)
1386                                continue;
1387                        for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1388                                const struct fib_nh *nh = &fi->fib_nh[nhsel];
1389
1390                                if (nh->nh_flags & RTNH_F_DEAD)
1391                                        continue;
1392                                if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1393                                        continue;
1394
1395#ifdef CONFIG_IP_FIB_TRIE_STATS
1396                                t->stats.semantic_match_passed++;
1397#endif
1398                                res->prefixlen = li->plen;
1399                                res->nh_sel = nhsel;
1400                                res->type = fa->fa_type;
1401                                res->scope = fa->fa_info->fib_scope;
1402                                res->fi = fi;
1403                                res->table = tb;
1404                                res->fa_head = &li->falh;
1405                                if (!(fib_flags & FIB_LOOKUP_NOREF))
1406                                        atomic_inc(&fi->fib_clntref);
1407                                return 0;
1408                        }
1409                }
1410
1411#ifdef CONFIG_IP_FIB_TRIE_STATS
1412                t->stats.semantic_match_miss++;
1413#endif
1414        }
1415
1416        return 1;
1417}
1418
1419int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1420                     struct fib_result *res, int fib_flags)
1421{
1422        struct trie *t = (struct trie *) tb->tb_data;
1423        int ret;
1424        struct rt_trie_node *n;
1425        struct tnode *pn;
1426        unsigned int pos, bits;
1427        t_key key = ntohl(flp->daddr);
1428        unsigned int chopped_off;
1429        t_key cindex = 0;
1430        unsigned int current_prefix_length = KEYLENGTH;
1431        struct tnode *cn;
1432        t_key pref_mismatch;
1433
1434        rcu_read_lock();
1435
1436        n = rcu_dereference(t->trie);
1437        if (!n)
1438                goto failed;
1439
1440#ifdef CONFIG_IP_FIB_TRIE_STATS
1441        t->stats.gets++;
1442#endif
1443
1444        /* Just a leaf? */
1445        if (IS_LEAF(n)) {
1446                ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1447                goto found;
1448        }
1449
1450        pn = (struct tnode *) n;
1451        chopped_off = 0;
1452
1453        while (pn) {
1454                pos = pn->pos;
1455                bits = pn->bits;
1456
1457                if (!chopped_off)
1458                        cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1459                                                   pos, bits);
1460
1461                n = tnode_get_child_rcu(pn, cindex);
1462
1463                if (n == NULL) {
1464#ifdef CONFIG_IP_FIB_TRIE_STATS
1465                        t->stats.null_node_hit++;
1466#endif
1467                        goto backtrace;
1468                }
1469
1470                if (IS_LEAF(n)) {
1471                        ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1472                        if (ret > 0)
1473                                goto backtrace;
1474                        goto found;
1475                }
1476
1477                cn = (struct tnode *)n;
1478
1479                /*
1480                 * It's a tnode, and we can do some extra checks here if we
1481                 * like, to avoid descending into a dead-end branch.
1482                 * This tnode is in the parent's child array at index
1483                 * key[p_pos..p_pos+p_bits] but potentially with some bits
1484                 * chopped off, so in reality the index may be just a
1485                 * subprefix, padded with zero at the end.
1486                 * We can also take a look at any skipped bits in this
1487                 * tnode - everything up to p_pos is supposed to be ok,
1488                 * and the non-chopped bits of the index (se previous
1489                 * paragraph) are also guaranteed ok, but the rest is
1490                 * considered unknown.
1491                 *
1492                 * The skipped bits are key[pos+bits..cn->pos].
1493                 */
1494
1495                /* If current_prefix_length < pos+bits, we are already doing
1496                 * actual prefix  matching, which means everything from
1497                 * pos+(bits-chopped_off) onward must be zero along some
1498                 * branch of this subtree - otherwise there is *no* valid
1499                 * prefix present. Here we can only check the skipped
1500                 * bits. Remember, since we have already indexed into the
1501                 * parent's child array, we know that the bits we chopped of
1502                 * *are* zero.
1503                 */
1504
1505                /* NOTA BENE: Checking only skipped bits
1506                   for the new node here */
1507
1508                if (current_prefix_length < pos+bits) {
1509                        if (tkey_extract_bits(cn->key, current_prefix_length,
1510                                                cn->pos - current_prefix_length)
1511                            || !(cn->child[0]))
1512                                goto backtrace;
1513                }
1514
1515                /*
1516                 * If chopped_off=0, the index is fully validated and we
1517                 * only need to look at the skipped bits for this, the new,
1518                 * tnode. What we actually want to do is to find out if
1519                 * these skipped bits match our key perfectly, or if we will
1520                 * have to count on finding a matching prefix further down,
1521                 * because if we do, we would like to have some way of
1522                 * verifying the existence of such a prefix at this point.
1523                 */
1524
1525                /* The only thing we can do at this point is to verify that
1526                 * any such matching prefix can indeed be a prefix to our
1527                 * key, and if the bits in the node we are inspecting that
1528                 * do not match our key are not ZERO, this cannot be true.
1529                 * Thus, find out where there is a mismatch (before cn->pos)
1530                 * and verify that all the mismatching bits are zero in the
1531                 * new tnode's key.
1532                 */
1533
1534                /*
1535                 * Note: We aren't very concerned about the piece of
1536                 * the key that precede pn->pos+pn->bits, since these
1537                 * have already been checked. The bits after cn->pos
1538                 * aren't checked since these are by definition
1539                 * "unknown" at this point. Thus, what we want to see
1540                 * is if we are about to enter the "prefix matching"
1541                 * state, and in that case verify that the skipped
1542                 * bits that will prevail throughout this subtree are
1543                 * zero, as they have to be if we are to find a
1544                 * matching prefix.
1545                 */
1546
1547                pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1548
1549                /*
1550                 * In short: If skipped bits in this node do not match
1551                 * the search key, enter the "prefix matching"
1552                 * state.directly.
1553                 */
1554                if (pref_mismatch) {
1555                        int mp = KEYLENGTH - fls(pref_mismatch);
1556
1557                        if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1558                                goto backtrace;
1559
1560                        if (current_prefix_length >= cn->pos)
1561                                current_prefix_length = mp;
1562                }
1563
1564                pn = (struct tnode *)n; /* Descend */
1565                chopped_off = 0;
1566                continue;
1567
1568backtrace:
1569                chopped_off++;
1570
1571                /* As zero don't change the child key (cindex) */
1572                while ((chopped_off <= pn->bits)
1573                       && !(cindex & (1<<(chopped_off-1))))
1574                        chopped_off++;
1575
1576                /* Decrease current_... with bits chopped off */
1577                if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1578                        current_prefix_length = pn->pos + pn->bits
1579                                - chopped_off;
1580
1581                /*
1582                 * Either we do the actual chop off according or if we have
1583                 * chopped off all bits in this tnode walk up to our parent.
1584                 */
1585
1586                if (chopped_off <= pn->bits) {
1587                        cindex &= ~(1 << (chopped_off-1));
1588                } else {
1589                        struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1590                        if (!parent)
1591                                goto failed;
1592
1593                        /* Get Child's index */
1594                        cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1595                        pn = parent;
1596                        chopped_off = 0;
1597
1598#ifdef CONFIG_IP_FIB_TRIE_STATS
1599                        t->stats.backtrack++;
1600#endif
1601                        goto backtrace;
1602                }
1603        }
1604failed:
1605        ret = 1;
1606found:
1607        rcu_read_unlock();
1608        return ret;
1609}
1610EXPORT_SYMBOL_GPL(fib_table_lookup);
1611
1612/*
1613 * Remove the leaf and return parent.
1614 */
1615static void trie_leaf_remove(struct trie *t, struct leaf *l)
1616{
1617        struct tnode *tp = node_parent((struct rt_trie_node *) l);
1618
1619        pr_debug("entering trie_leaf_remove(%p)\n", l);
1620
1621        if (tp) {
1622                t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1623                put_child(t, (struct tnode *)tp, cindex, NULL);
1624                trie_rebalance(t, tp);
1625        } else
1626                RCU_INIT_POINTER(t->trie, NULL);
1627
1628        free_leaf(l);
1629}
1630
1631/*
1632 * Caller must hold RTNL.
1633 */
1634int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1635{
1636        struct trie *t = (struct trie *) tb->tb_data;
1637        u32 key, mask;
1638        int plen = cfg->fc_dst_len;
1639        u8 tos = cfg->fc_tos;
1640        struct fib_alias *fa, *fa_to_delete;
1641        struct list_head *fa_head;
1642        struct leaf *l;
1643        struct leaf_info *li;
1644
1645        if (plen > 32)
1646                return -EINVAL;
1647
1648        key = ntohl(cfg->fc_dst);
1649        mask = ntohl(inet_make_mask(plen));
1650
1651        if (key & ~mask)
1652                return -EINVAL;
1653
1654        key = key & mask;
1655        l = fib_find_node(t, key);
1656
1657        if (!l)
1658                return -ESRCH;
1659
1660        fa_head = get_fa_head(l, plen);
1661        fa = fib_find_alias(fa_head, tos, 0);
1662
1663        if (!fa)
1664                return -ESRCH;
1665
1666        pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1667
1668        fa_to_delete = NULL;
1669        fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1670        list_for_each_entry_continue(fa, fa_head, fa_list) {
1671                struct fib_info *fi = fa->fa_info;
1672
1673                if (fa->fa_tos != tos)
1674                        break;
1675
1676                if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1677                    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1678                     fa->fa_info->fib_scope == cfg->fc_scope) &&
1679                    (!cfg->fc_prefsrc ||
1680                     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1681                    (!cfg->fc_protocol ||
1682                     fi->fib_protocol == cfg->fc_protocol) &&
1683                    fib_nh_match(cfg, fi) == 0) {
1684                        fa_to_delete = fa;
1685                        break;
1686                }
1687        }
1688
1689        if (!fa_to_delete)
1690                return -ESRCH;
1691
1692        fa = fa_to_delete;
1693        rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1694                  &cfg->fc_nlinfo, 0);
1695
1696        l = fib_find_node(t, key);
1697        li = find_leaf_info(l, plen);
1698
1699        list_del_rcu(&fa->fa_list);
1700
1701        if (!plen)
1702                tb->tb_num_default--;
1703
1704        if (list_empty(fa_head)) {
1705                hlist_del_rcu(&li->hlist);
1706                free_leaf_info(li);
1707        }
1708
1709        if (hlist_empty(&l->list))
1710                trie_leaf_remove(t, l);
1711
1712        if (fa->fa_state & FA_S_ACCESSED)
1713                rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1714
1715        fib_release_info(fa->fa_info);
1716        alias_free_mem_rcu(fa);
1717        return 0;
1718}
1719
1720static int trie_flush_list(struct list_head *head)
1721{
1722        struct fib_alias *fa, *fa_node;
1723        int found = 0;
1724
1725        list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1726                struct fib_info *fi = fa->fa_info;
1727
1728                if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1729                        list_del_rcu(&fa->fa_list);
1730                        fib_release_info(fa->fa_info);
1731                        alias_free_mem_rcu(fa);
1732                        found++;
1733                }
1734        }
1735        return found;
1736}
1737
1738static int trie_flush_leaf(struct leaf *l)
1739{
1740        int found = 0;
1741        struct hlist_head *lih = &l->list;
1742        struct hlist_node *node, *tmp;
1743        struct leaf_info *li = NULL;
1744
1745        hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1746                found += trie_flush_list(&li->falh);
1747
1748                if (list_empty(&li->falh)) {
1749                        hlist_del_rcu(&li->hlist);
1750                        free_leaf_info(li);
1751                }
1752        }
1753        return found;
1754}
1755
1756/*
1757 * Scan for the next right leaf starting at node p->child[idx]
1758 * Since we have back pointer, no recursion necessary.
1759 */
1760static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1761{
1762        do {
1763                t_key idx;
1764
1765                if (c)
1766                        idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1767                else
1768                        idx = 0;
1769
1770                while (idx < 1u << p->bits) {
1771                        c = tnode_get_child_rcu(p, idx++);
1772                        if (!c)
1773                                continue;
1774
1775                        if (IS_LEAF(c)) {
1776                                prefetch(rcu_dereference_rtnl(p->child[idx]));
1777                                return (struct leaf *) c;
1778                        }
1779
1780                        /* Rescan start scanning in new node */
1781                        p = (struct tnode *) c;
1782                        idx = 0;
1783                }
1784
1785                /* Node empty, walk back up to parent */
1786                c = (struct rt_trie_node *) p;
1787        } while ((p = node_parent_rcu(c)) != NULL);
1788
1789        return NULL; /* Root of trie */
1790}
1791
1792static struct leaf *trie_firstleaf(struct trie *t)
1793{
1794        struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1795
1796        if (!n)
1797                return NULL;
1798
1799        if (IS_LEAF(n))          /* trie is just a leaf */
1800                return (struct leaf *) n;
1801
1802        return leaf_walk_rcu(n, NULL);
1803}
1804
1805static struct leaf *trie_nextleaf(struct leaf *l)
1806{
1807        struct rt_trie_node *c = (struct rt_trie_node *) l;
1808        struct tnode *p = node_parent_rcu(c);
1809
1810        if (!p)
1811                return NULL;    /* trie with just one leaf */
1812
1813        return leaf_walk_rcu(p, c);
1814}
1815
1816static struct leaf *trie_leafindex(struct trie *t, int index)
1817{
1818        struct leaf *l = trie_firstleaf(t);
1819
1820        while (l && index-- > 0)
1821                l = trie_nextleaf(l);
1822
1823        return l;
1824}
1825
1826
1827/*
1828 * Caller must hold RTNL.
1829 */
1830int fib_table_flush(struct fib_table *tb)
1831{
1832        struct trie *t = (struct trie *) tb->tb_data;
1833        struct leaf *l, *ll = NULL;
1834        int found = 0;
1835
1836        for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1837                found += trie_flush_leaf(l);
1838
1839                if (ll && hlist_empty(&ll->list))
1840                        trie_leaf_remove(t, ll);
1841                ll = l;
1842        }
1843
1844        if (ll && hlist_empty(&ll->list))
1845                trie_leaf_remove(t, ll);
1846
1847        pr_debug("trie_flush found=%d\n", found);
1848        return found;
1849}
1850
1851void fib_free_table(struct fib_table *tb)
1852{
1853        kfree(tb);
1854}
1855
1856static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1857                           struct fib_table *tb,
1858                           struct sk_buff *skb, struct netlink_callback *cb)
1859{
1860        int i, s_i;
1861        struct fib_alias *fa;
1862        __be32 xkey = htonl(key);
1863
1864        s_i = cb->args[5];
1865        i = 0;
1866
1867        /* rcu_read_lock is hold by caller */
1868
1869        list_for_each_entry_rcu(fa, fah, fa_list) {
1870                if (i < s_i) {
1871                        i++;
1872                        continue;
1873                }
1874
1875                if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1876                                  cb->nlh->nlmsg_seq,
1877                                  RTM_NEWROUTE,
1878                                  tb->tb_id,
1879                                  fa->fa_type,
1880                                  xkey,
1881                                  plen,
1882                                  fa->fa_tos,
1883                                  fa->fa_info, NLM_F_MULTI) < 0) {
1884                        cb->args[5] = i;
1885                        return -1;
1886                }
1887                i++;
1888        }
1889        cb->args[5] = i;
1890        return skb->len;
1891}
1892
1893static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1894                        struct sk_buff *skb, struct netlink_callback *cb)
1895{
1896        struct leaf_info *li;
1897        struct hlist_node *node;
1898        int i, s_i;
1899
1900        s_i = cb->args[4];
1901        i = 0;
1902
1903        /* rcu_read_lock is hold by caller */
1904        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1905                if (i < s_i) {
1906                        i++;
1907                        continue;
1908                }
1909
1910                if (i > s_i)
1911                        cb->args[5] = 0;
1912
1913                if (list_empty(&li->falh))
1914                        continue;
1915
1916                if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1917                        cb->args[4] = i;
1918                        return -1;
1919                }
1920                i++;
1921        }
1922
1923        cb->args[4] = i;
1924        return skb->len;
1925}
1926
1927int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1928                   struct netlink_callback *cb)
1929{
1930        struct leaf *l;
1931        struct trie *t = (struct trie *) tb->tb_data;
1932        t_key key = cb->args[2];
1933        int count = cb->args[3];
1934
1935        rcu_read_lock();
1936        /* Dump starting at last key.
1937         * Note: 0.0.0.0/0 (ie default) is first key.
1938         */
1939        if (count == 0)
1940                l = trie_firstleaf(t);
1941        else {
1942                /* Normally, continue from last key, but if that is missing
1943                 * fallback to using slow rescan
1944                 */
1945                l = fib_find_node(t, key);
1946                if (!l)
1947                        l = trie_leafindex(t, count);
1948        }
1949
1950        while (l) {
1951                cb->args[2] = l->key;
1952                if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1953                        cb->args[3] = count;
1954                        rcu_read_unlock();
1955                        return -1;
1956                }
1957
1958                ++count;
1959                l = trie_nextleaf(l);
1960                memset(&cb->args[4], 0,
1961                       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1962        }
1963        cb->args[3] = count;
1964        rcu_read_unlock();
1965
1966        return skb->len;
1967}
1968
1969void __init fib_trie_init(void)
1970{
1971        fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1972                                          sizeof(struct fib_alias),
1973                                          0, SLAB_PANIC, NULL);
1974
1975        trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1976                                           max(sizeof(struct leaf),
1977                                               sizeof(struct leaf_info)),
1978                                           0, SLAB_PANIC, NULL);
1979}
1980
1981
1982struct fib_table *fib_trie_table(u32 id)
1983{
1984        struct fib_table *tb;
1985        struct trie *t;
1986
1987        tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1988                     GFP_KERNEL);
1989        if (tb == NULL)
1990                return NULL;
1991
1992        tb->tb_id = id;
1993        tb->tb_default = -1;
1994        tb->tb_num_default = 0;
1995
1996        t = (struct trie *) tb->tb_data;
1997        memset(t, 0, sizeof(*t));
1998
1999        return tb;
2000}
2001
2002#ifdef CONFIG_PROC_FS
2003/* Depth first Trie walk iterator */
2004struct fib_trie_iter {
2005        struct seq_net_private p;
2006        struct fib_table *tb;
2007        struct tnode *tnode;
2008        unsigned int index;
2009        unsigned int depth;
2010};
2011
2012static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2013{
2014        struct tnode *tn = iter->tnode;
2015        unsigned int cindex = iter->index;
2016        struct tnode *p;
2017
2018        /* A single entry routing table */
2019        if (!tn)
2020                return NULL;
2021
2022        pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2023                 iter->tnode, iter->index, iter->depth);
2024rescan:
2025        while (cindex < (1<<tn->bits)) {
2026                struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2027
2028                if (n) {
2029                        if (IS_LEAF(n)) {
2030                                iter->tnode = tn;
2031                                iter->index = cindex + 1;
2032                        } else {
2033                                /* push down one level */
2034                                iter->tnode = (struct tnode *) n;
2035                                iter->index = 0;
2036                                ++iter->depth;
2037                        }
2038                        return n;
2039                }
2040
2041                ++cindex;
2042        }
2043
2044        /* Current node exhausted, pop back up */
2045        p = node_parent_rcu((struct rt_trie_node *)tn);
2046        if (p) {
2047                cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2048                tn = p;
2049                --iter->depth;
2050                goto rescan;
2051        }
2052
2053        /* got root? */
2054        return NULL;
2055}
2056
2057static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2058                                       struct trie *t)
2059{
2060        struct rt_trie_node *n;
2061
2062        if (!t)
2063                return NULL;
2064
2065        n = rcu_dereference(t->trie);
2066        if (!n)
2067                return NULL;
2068
2069        if (IS_TNODE(n)) {
2070                iter->tnode = (struct tnode *) n;
2071                iter->index = 0;
2072                iter->depth = 1;
2073        } else {
2074                iter->tnode = NULL;
2075                iter->index = 0;
2076                iter->depth = 0;
2077        }
2078
2079        return n;
2080}
2081
2082static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2083{
2084        struct rt_trie_node *n;
2085        struct fib_trie_iter iter;
2086
2087        memset(s, 0, sizeof(*s));
2088
2089        rcu_read_lock();
2090        for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2091                if (IS_LEAF(n)) {
2092                        struct leaf *l = (struct leaf *)n;
2093                        struct leaf_info *li;
2094                        struct hlist_node *tmp;
2095
2096                        s->leaves++;
2097                        s->totdepth += iter.depth;
2098                        if (iter.depth > s->maxdepth)
2099                                s->maxdepth = iter.depth;
2100
2101                        hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2102                                ++s->prefixes;
2103                } else {
2104                        const struct tnode *tn = (const struct tnode *) n;
2105                        int i;
2106
2107                        s->tnodes++;
2108                        if (tn->bits < MAX_STAT_DEPTH)
2109                                s->nodesizes[tn->bits]++;
2110
2111                        for (i = 0; i < (1<<tn->bits); i++)
2112                                if (!tn->child[i])
2113                                        s->nullpointers++;
2114                }
2115        }
2116        rcu_read_unlock();
2117}
2118
2119/*
2120 *      This outputs /proc/net/fib_triestats
2121 */
2122static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2123{
2124        unsigned int i, max, pointers, bytes, avdepth;
2125
2126        if (stat->leaves)
2127                avdepth = stat->totdepth*100 / stat->leaves;
2128        else
2129                avdepth = 0;
2130
2131        seq_printf(seq, "\tAver depth:     %u.%02d\n",
2132                   avdepth / 100, avdepth % 100);
2133        seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2134
2135        seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2136        bytes = sizeof(struct leaf) * stat->leaves;
2137
2138        seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2139        bytes += sizeof(struct leaf_info) * stat->prefixes;
2140
2141        seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2142        bytes += sizeof(struct tnode) * stat->tnodes;
2143
2144        max = MAX_STAT_DEPTH;
2145        while (max > 0 && stat->nodesizes[max-1] == 0)
2146                max--;
2147
2148        pointers = 0;
2149        for (i = 1; i <= max; i++)
2150                if (stat->nodesizes[i] != 0) {
2151                        seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2152                        pointers += (1<<i) * stat->nodesizes[i];
2153                }
2154        seq_putc(seq, '\n');
2155        seq_printf(seq, "\tPointers: %u\n", pointers);
2156
2157        bytes += sizeof(struct rt_trie_node *) * pointers;
2158        seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2159        seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2160}
2161
2162#ifdef CONFIG_IP_FIB_TRIE_STATS
2163static void trie_show_usage(struct seq_file *seq,
2164                            const struct trie_use_stats *stats)
2165{
2166        seq_printf(seq, "\nCounters:\n---------\n");
2167        seq_printf(seq, "gets = %u\n", stats->gets);
2168        seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2169        seq_printf(seq, "semantic match passed = %u\n",
2170                   stats->semantic_match_passed);
2171        seq_printf(seq, "semantic match miss = %u\n",
2172                   stats->semantic_match_miss);
2173        seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2174        seq_printf(seq, "skipped node resize = %u\n\n",
2175                   stats->resize_node_skipped);
2176}
2177#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2178
2179static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2180{
2181        if (tb->tb_id == RT_TABLE_LOCAL)
2182                seq_puts(seq, "Local:\n");
2183        else if (tb->tb_id == RT_TABLE_MAIN)
2184                seq_puts(seq, "Main:\n");
2185        else
2186                seq_printf(seq, "Id %d:\n", tb->tb_id);
2187}
2188
2189
2190static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2191{
2192        struct net *net = (struct net *)seq->private;
2193        unsigned int h;
2194
2195        seq_printf(seq,
2196                   "Basic info: size of leaf:"
2197                   " %Zd bytes, size of tnode: %Zd bytes.\n",
2198                   sizeof(struct leaf), sizeof(struct tnode));
2199
2200        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2201                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2202                struct hlist_node *node;
2203                struct fib_table *tb;
2204
2205                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2206                        struct trie *t = (struct trie *) tb->tb_data;
2207                        struct trie_stat stat;
2208
2209                        if (!t)
2210                                continue;
2211
2212                        fib_table_print(seq, tb);
2213
2214                        trie_collect_stats(t, &stat);
2215                        trie_show_stats(seq, &stat);
2216#ifdef CONFIG_IP_FIB_TRIE_STATS
2217                        trie_show_usage(seq, &t->stats);
2218#endif
2219                }
2220        }
2221
2222        return 0;
2223}
2224
2225static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2226{
2227        return single_open_net(inode, file, fib_triestat_seq_show);
2228}
2229
2230static const struct file_operations fib_triestat_fops = {
2231        .owner  = THIS_MODULE,
2232        .open   = fib_triestat_seq_open,
2233        .read   = seq_read,
2234        .llseek = seq_lseek,
2235        .release = single_release_net,
2236};
2237
2238static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2239{
2240        struct fib_trie_iter *iter = seq->private;
2241        struct net *net = seq_file_net(seq);
2242        loff_t idx = 0;
2243        unsigned int h;
2244
2245        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2246                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2247                struct hlist_node *node;
2248                struct fib_table *tb;
2249
2250                hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2251                        struct rt_trie_node *n;
2252
2253                        for (n = fib_trie_get_first(iter,
2254                                                    (struct trie *) tb->tb_data);
2255                             n; n = fib_trie_get_next(iter))
2256                                if (pos == idx++) {
2257                                        iter->tb = tb;
2258                                        return n;
2259                                }
2260                }
2261        }
2262
2263        return NULL;
2264}
2265
2266static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2267        __acquires(RCU)
2268{
2269        rcu_read_lock();
2270        return fib_trie_get_idx(seq, *pos);
2271}
2272
2273static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2274{
2275        struct fib_trie_iter *iter = seq->private;
2276        struct net *net = seq_file_net(seq);
2277        struct fib_table *tb = iter->tb;
2278        struct hlist_node *tb_node;
2279        unsigned int h;
2280        struct rt_trie_node *n;
2281
2282        ++*pos;
2283        /* next node in same table */
2284        n = fib_trie_get_next(iter);
2285        if (n)
2286                return n;
2287
2288        /* walk rest of this hash chain */
2289        h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2290        while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2291                tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2292                n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2293                if (n)
2294                        goto found;
2295        }
2296
2297        /* new hash chain */
2298        while (++h < FIB_TABLE_HASHSZ) {
2299                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2300                hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2301                        n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2302                        if (n)
2303                                goto found;
2304                }
2305        }
2306        return NULL;
2307
2308found:
2309        iter->tb = tb;
2310        return n;
2311}
2312
2313static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2314        __releases(RCU)
2315{
2316        rcu_read_unlock();
2317}
2318
2319static void seq_indent(struct seq_file *seq, int n)
2320{
2321        while (n-- > 0)
2322                seq_puts(seq, "   ");
2323}
2324
2325static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2326{
2327        switch (s) {
2328        case RT_SCOPE_UNIVERSE: return "universe";
2329        case RT_SCOPE_SITE:     return "site";
2330        case RT_SCOPE_LINK:     return "link";
2331        case RT_SCOPE_HOST:     return "host";
2332        case RT_SCOPE_NOWHERE:  return "nowhere";
2333        default:
2334                snprintf(buf, len, "scope=%d", s);
2335                return buf;
2336        }
2337}
2338
2339static const char *const rtn_type_names[__RTN_MAX] = {
2340        [RTN_UNSPEC] = "UNSPEC",
2341        [RTN_UNICAST] = "UNICAST",
2342        [RTN_LOCAL] = "LOCAL",
2343        [RTN_BROADCAST] = "BROADCAST",
2344        [RTN_ANYCAST] = "ANYCAST",
2345        [RTN_MULTICAST] = "MULTICAST",
2346        [RTN_BLACKHOLE] = "BLACKHOLE",
2347        [RTN_UNREACHABLE] = "UNREACHABLE",
2348        [RTN_PROHIBIT] = "PROHIBIT",
2349        [RTN_THROW] = "THROW",
2350        [RTN_NAT] = "NAT",
2351        [RTN_XRESOLVE] = "XRESOLVE",
2352};
2353
2354static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2355{
2356        if (t < __RTN_MAX && rtn_type_names[t])
2357                return rtn_type_names[t];
2358        snprintf(buf, len, "type %u", t);
2359        return buf;
2360}
2361
2362/* Pretty print the trie */
2363static int fib_trie_seq_show(struct seq_file *seq, void *v)
2364{
2365        const struct fib_trie_iter *iter = seq->private;
2366        struct rt_trie_node *n = v;
2367
2368        if (!node_parent_rcu(n))
2369                fib_table_print(seq, iter->tb);
2370
2371        if (IS_TNODE(n)) {
2372                struct tnode *tn = (struct tnode *) n;
2373                __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2374
2375                seq_indent(seq, iter->depth-1);
2376                seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2377                           &prf, tn->pos, tn->bits, tn->full_children,
2378                           tn->empty_children);
2379
2380        } else {
2381                struct leaf *l = (struct leaf *) n;
2382                struct leaf_info *li;
2383                struct hlist_node *node;
2384                __be32 val = htonl(l->key);
2385
2386                seq_indent(seq, iter->depth);
2387                seq_printf(seq, "  |-- %pI4\n", &val);
2388
2389                hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2390                        struct fib_alias *fa;
2391
2392                        list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2393                                char buf1[32], buf2[32];
2394
2395                                seq_indent(seq, iter->depth+1);
2396                                seq_printf(seq, "  /%d %s %s", li->plen,
2397                                           rtn_scope(buf1, sizeof(buf1),
2398                                                     fa->fa_info->fib_scope),
2399                                           rtn_type(buf2, sizeof(buf2),
2400                                                    fa->fa_type));
2401                                if (fa->fa_tos)
2402                                        seq_printf(seq, " tos=%d", fa->fa_tos);
2403                                seq_putc(seq, '\n');
2404                        }
2405                }
2406        }
2407
2408        return 0;
2409}
2410
2411static const struct seq_operations fib_trie_seq_ops = {
2412        .start  = fib_trie_seq_start,
2413        .next   = fib_trie_seq_next,
2414        .stop   = fib_trie_seq_stop,
2415        .show   = fib_trie_seq_show,
2416};
2417
2418static int fib_trie_seq_open(struct inode *inode, struct file *file)
2419{
2420        return seq_open_net(inode, file, &fib_trie_seq_ops,
2421                            sizeof(struct fib_trie_iter));
2422}
2423
2424static const struct file_operations fib_trie_fops = {
2425        .owner  = THIS_MODULE,
2426        .open   = fib_trie_seq_open,
2427        .read   = seq_read,
2428        .llseek = seq_lseek,
2429        .release = seq_release_net,
2430};
2431
2432struct fib_route_iter {
2433        struct seq_net_private p;
2434        struct trie *main_trie;
2435        loff_t  pos;
2436        t_key   key;
2437};
2438
2439static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2440{
2441        struct leaf *l = NULL;
2442        struct trie *t = iter->main_trie;
2443
2444        /* use cache location of last found key */
2445        if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2446                pos -= iter->pos;
2447        else {
2448                iter->pos = 0;
2449                l = trie_firstleaf(t);
2450        }
2451
2452        while (l && pos-- > 0) {
2453                iter->pos++;
2454                l = trie_nextleaf(l);
2455        }
2456
2457        if (l)
2458                iter->key = pos;        /* remember it */
2459        else
2460                iter->pos = 0;          /* forget it */
2461
2462        return l;
2463}
2464
2465static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2466        __acquires(RCU)
2467{
2468        struct fib_route_iter *iter = seq->private;
2469        struct fib_table *tb;
2470
2471        rcu_read_lock();
2472        tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2473        if (!tb)
2474                return NULL;
2475
2476        iter->main_trie = (struct trie *) tb->tb_data;
2477        if (*pos == 0)
2478                return SEQ_START_TOKEN;
2479        else
2480                return fib_route_get_idx(iter, *pos - 1);
2481}
2482
2483static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2484{
2485        struct fib_route_iter *iter = seq->private;
2486        struct leaf *l = v;
2487
2488        ++*pos;
2489        if (v == SEQ_START_TOKEN) {
2490                iter->pos = 0;
2491                l = trie_firstleaf(iter->main_trie);
2492        } else {
2493                iter->pos++;
2494                l = trie_nextleaf(l);
2495        }
2496
2497        if (l)
2498                iter->key = l->key;
2499        else
2500                iter->pos = 0;
2501        return l;
2502}
2503
2504static void fib_route_seq_stop(struct seq_file *seq, void *v)
2505        __releases(RCU)
2506{
2507        rcu_read_unlock();
2508}
2509
2510static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2511{
2512        unsigned int flags = 0;
2513
2514        if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2515                flags = RTF_REJECT;
2516        if (fi && fi->fib_nh->nh_gw)
2517                flags |= RTF_GATEWAY;
2518        if (mask == htonl(0xFFFFFFFF))
2519                flags |= RTF_HOST;
2520        flags |= RTF_UP;
2521        return flags;
2522}
2523
2524/*
2525 *      This outputs /proc/net/route.
2526 *      The format of the file is not supposed to be changed
2527 *      and needs to be same as fib_hash output to avoid breaking
2528 *      legacy utilities
2529 */
2530static int fib_route_seq_show(struct seq_file *seq, void *v)
2531{
2532        struct leaf *l = v;
2533        struct leaf_info *li;
2534        struct hlist_node *node;
2535
2536        if (v == SEQ_START_TOKEN) {
2537                seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2538                           "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2539                           "\tWindow\tIRTT");
2540                return 0;
2541        }
2542
2543        hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2544                struct fib_alias *fa;
2545                __be32 mask, prefix;
2546
2547                mask = inet_make_mask(li->plen);
2548                prefix = htonl(l->key);
2549
2550                list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2551                        const struct fib_info *fi = fa->fa_info;
2552                        unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2553                        int len;
2554
2555                        if (fa->fa_type == RTN_BROADCAST
2556                            || fa->fa_type == RTN_MULTICAST)
2557                                continue;
2558
2559                        if (fi)
2560                                seq_printf(seq,
2561                                         "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2562                                         "%d\t%08X\t%d\t%u\t%u%n",
2563                                         fi->fib_dev ? fi->fib_dev->name : "*",
2564                                         prefix,
2565                                         fi->fib_nh->nh_gw, flags, 0, 0,
2566                                         fi->fib_priority,
2567                                         mask,
2568                                         (fi->fib_advmss ?
2569                                          fi->fib_advmss + 40 : 0),
2570                                         fi->fib_window,
2571                                         fi->fib_rtt >> 3, &len);
2572                        else
2573                                seq_printf(seq,
2574                                         "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2575                                         "%d\t%08X\t%d\t%u\t%u%n",
2576                                         prefix, 0, flags, 0, 0, 0,
2577                                         mask, 0, 0, 0, &len);
2578
2579                        seq_printf(seq, "%*s\n", 127 - len, "");
2580                }
2581        }
2582
2583        return 0;
2584}
2585
2586static const struct seq_operations fib_route_seq_ops = {
2587        .start  = fib_route_seq_start,
2588        .next   = fib_route_seq_next,
2589        .stop   = fib_route_seq_stop,
2590        .show   = fib_route_seq_show,
2591};
2592
2593static int fib_route_seq_open(struct inode *inode, struct file *file)
2594{
2595        return seq_open_net(inode, file, &fib_route_seq_ops,
2596                            sizeof(struct fib_route_iter));
2597}
2598
2599static const struct file_operations fib_route_fops = {
2600        .owner  = THIS_MODULE,
2601        .open   = fib_route_seq_open,
2602        .read   = seq_read,
2603        .llseek = seq_lseek,
2604        .release = seq_release_net,
2605};
2606
2607int __net_init fib_proc_init(struct net *net)
2608{
2609        if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2610                goto out1;
2611
2612        if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2613                                  &fib_triestat_fops))
2614                goto out2;
2615
2616        if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2617                goto out3;
2618
2619        return 0;
2620
2621out3:
2622        proc_net_remove(net, "fib_triestat");
2623out2:
2624        proc_net_remove(net, "fib_trie");
2625out1:
2626        return -ENOMEM;
2627}
2628
2629void __net_exit fib_proc_exit(struct net *net)
2630{
2631        proc_net_remove(net, "fib_trie");
2632        proc_net_remove(net, "fib_triestat");
2633        proc_net_remove(net, "route");
2634}
2635
2636#endif /* CONFIG_PROC_FS */
2637
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.