linux/net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#include <net/netevent.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay              = 2 * HZ;
static int ip_rt_max_delay              = 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
static int ip_rt_gc_interval            = 60 * HZ;
static int ip_rt_gc_min_interval        = HZ / 2;
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;
static int ip_rt_min_advmss             = 256;
static int ip_rt_secret_interval        = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
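
/* Editor's note (illustrative, not part of the original file): this table
 * is indexed with the TOS bits shifted right by one, via rt_tos2priority()
 * in include/net/route.h, roughly:
 *
 *      #define rt_tos2priority(tos)    (ip_tos2prio[IPTOS_TOS(tos)>>1])
 *
 * so e.g. IPTOS_LOWDELAY (0x10) maps to index 8, TC_PRIO_INTERACTIVE.
 */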


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
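
/* Illustrative sketch (not in the original file): a reader-side lookup
 * following the scheme above.  It walks one bucket under the BH-safe RCU
 * read lock and takes a reference with an atomic increment (dst_hold())
 * before leaving the RCU section.  It uses rt_hash_table, rt_hash() and
 * compare_keys(), all defined further down in this file, so it is kept
 * compiled out.
 */
#if 0
static struct rtable *rt_lookup_example(struct flowi *key)
{
        unsigned hash = rt_hash(key->fl4_dst, key->fl4_src, key->oif);
        struct rtable *rth;

        rcu_read_lock_bh();
        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
             rth = rcu_dereference(rth->u.rt_next)) {
                if (compare_keys(&rth->fl, key)) {
                        dst_hold(&rth->u.dst);  /* reader reference */
                        break;
                }
        }
        rcu_read_unlock_bh();
        return rth;
}
#endif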

struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()    { \
                int i; \
                rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
                if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
                for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
                        spin_lock_init(&rt_hash_locks[i]); \
                }
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
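
/* Illustrative note (not in the original file): with, say, NR_CPUS == 8
 * the table above holds 1024 spinlocks, and rt_hash_lock_addr() simply
 * masks the bucket index, so many hash buckets share each lock:
 *
 *      spin_lock_bh(rt_hash_lock_addr(hash));
 *      ... modify rt_hash_table[hash].chain ...
 *      spin_unlock_bh(rt_hash_lock_addr(hash));
 */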

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
        (__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
        return (jhash_2words(daddr, saddr, rt_hash_rnd)
                & rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
        rt_hash_code((__force u32)(__be32)(daddr),\
                     (__force u32)(__be32)(saddr) ^ ((idx) << 5))

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = rcu_dereference(seq->private);

        r = r->u.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        multipath_remove(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        multipath_remove(rt);
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in the hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
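
/* Illustrative example (not in the original file): among unreferenced
 * entries in a chain, eviction picks the *smallest* score.  An input
 * multicast route idle for 10s scores only ~age with both high bits
 * clear, while an output unicast route idle for 1s gets bit 30 set on
 * top of a larger ~age, so the stale multicast entry is evicted first.
 */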

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
                (fl1->mark ^ fl2->mark) |
                (*(u16 *)&fl1->nl_u.ip4_u.tos ^
                 *(u16 *)&fl2->nl_u.ip4_u.tos) |
                (fl1->oif ^ fl2->oif) |
                (fl1->iif ^ fl2->iif)) == 0;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
                                                struct rtable *expentry,
                                                int *removed_count)
{
        int passedexpired = 0;
        struct rtable **nextstep = NULL;
        struct rtable **rthp = chain_head;
        struct rtable *rth;

        if (removed_count)
                *removed_count = 0;

        while ((rth = *rthp) != NULL) {
                if (rth == expentry)
                        passedexpired = 1;

                if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
                    compare_keys(&(*rthp)->fl, &expentry->fl)) {
                        if (*rthp == expentry) {
                                *rthp = rth->u.rt_next;
                                continue;
                        } else {
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                if (removed_count)
                                        ++(*removed_count);
                        }
                } else {
                        if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
                            passedexpired && !nextstep)
                                nextstep = &rth->u.rt_next;

                        rthp = &rth->u.rt_next;
                }
        }

        rt_free(expentry);
        if (removed_count)
                ++(*removed_count);

        return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */

/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        u64 mult;

        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (*rthp == 0)
                        continue;
                spin_lock(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Clean up aged-off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                        /* remove all related balanced entries if necessary */
                        if (rth->u.dst.flags & DST_BALANCED) {
                                rthp = rt_remove_balanced_route(
                                        &rt_hash_table[i].chain,
                                        rth, NULL);
                                if (!rthp)
                                        break;
                        } else {
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                        }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                }
                spin_unlock(rt_hash_lock_addr(i));

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(rt_hash_lock_addr(i));
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(rt_hash_lock_addr(i));

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        /* flush existing multipath state */
        multipath_flush();

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not yet been reached, prolong the
                   timer to "delay", otherwise fire it at the deadline.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}
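
/* Illustrative note (not in the original file): callers pass the delay in
 * jiffies.  rt_cache_flush(0) flushes synchronously; a negative delay,
 * e.g.
 *
 *      rt_cache_flush(-1);
 *
 * means "use ip_rt_min_delay" (2s by default), while the rt_deadline
 * logic above guarantees the flush fires no later than ip_rt_max_delay
 * (10s by default) after the first pending request.
 */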

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not run it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(rt_hash_lock_addr(k));
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                                /* remove all related balanced entries
                                 * if necessary
                                 */
                                if (rth->u.dst.flags & DST_BALANCED) {
                                        int r;

                                        rthp = rt_remove_balanced_route(
                                                &rt_hash_table[k].chain,
                                                rth,
                                                &r);
                                        goal -= r;
                                        if (!rthp)
                                                break;
                                } else {
                                        *rthp = rth->u.rt_next;
                                        rt_free(rth);
                                        goal--;
                                }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                        }
                        spin_unlock_bh(rt_hash_lock_addr(k));
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* The goal was not achieved. We stop the process if:

                   - expire has been reduced to zero; otherwise it is halved.
                   - the table is not full.
                   - we are called from interrupt context.
                   - the jiffies check is just a fallback/debug loop breaker;
                     we will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}
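
/* Illustrative arithmetic (not in the original file): with the default
 * ip_rt_gc_elasticity of 8, the initial goal above is
 *
 *      goal = entries - (8 << rt_hash_log);
 *
 * i.e. GC only has real work to do once the cache holds more than eight
 * entries per hash bucket; below that it just trims toward equilibrium.
 */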

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                if (!(rth->u.dst.flags & DST_BALANCED) &&
                    compare_keys(&rth->fl, &rt->fl)) {
#else
                if (compare_keys(&rth->fl, &rt->fl)) {
#endif
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to an arp neighbour only if it is an
           output route or a unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* The neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           it is most likely holding some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(rt_hash_lock_addr(hash));
        *rp = rt;
        return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
        static DEFINE_SPINLOCK(rt_peer_lock);
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that stays unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
                       __builtin_return_address(0));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(rt_hash_lock_addr(hash));
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                    __be32 saddr, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        __be32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };
        struct netevent_redirect netevent;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

                        rthp=&rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                netevent.old = &rth->u.dst;
                                netevent.new = &rt->u.dst;
                                call_netevent_notifiers(NETEVENT_REDIRECT,
                                                        &netevent);

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
                                                rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
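
/* Illustrative arithmetic (not in the original file): with the defaults
 * above (ip_rt_redirect_number = 9, ip_rt_redirect_load = HZ/50), the
 * next redirect is sent no sooner than
 *
 *      rate_last + ((HZ/50) << rate_tokens)
 *
 * i.e. after 20ms, 40ms, ... up to ~10s, and ip_rt_redirect_silence =
 * (HZ/50) << 10 is roughly 20 seconds before the counter resets.
 */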

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (rt->u.dst.rate_tokens == 0 ||
            time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
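
/* Illustrative example (not in the original file): guess_mtu() returns
 * the first plateau strictly below the old MTU, so a Fragmentation
 * Needed report quoting old_mtu == 1500 yields 1492 (the IEEE 802.3
 * plateau), and anything at or below 128 falls through to the IPv4
 * minimum of 68.
 */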
1401
1402unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1403{
1404        int i;
1405        unsigned short old_mtu = ntohs(iph->tot_len);
1406        struct rtable *rth;
1407        __be32  skeys[2] = { iph->saddr, 0, };
1408        __be32  daddr = iph->daddr;
1409        unsigned short est_mtu = 0;
1410
1411        if (ipv4_config.no_pmtu_disc)
1412                return 0;
1413
1414        for (i = 0; i < 2; i++) {
1415                unsigned hash = rt_hash(daddr, skeys[i], 0);
1416
1417                rcu_read_lock();
1418                for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1419                     rth = rcu_dereference(rth->u.rt_next)) {
1420                        if (rth->fl.fl4_dst == daddr &&
1421                            rth->fl.fl4_src == skeys[i] &&
1422                            rth->rt_dst  == daddr &&
1423                            rth->rt_src  == iph->saddr &&
1424                            rth->fl.iif == 0 &&
1425                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1426                                unsigned short mtu = new_mtu;
1427
1428                                if (new_mtu < 68 || new_mtu >= old_mtu) {
1429
1430                                        /* BSD 4.2 compatibility hack :-( */
1431                                        if (mtu == 0 &&
1432                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1433                                            old_mtu >= 68 + (iph->ihl << 2))
1434                                                old_mtu -= iph->ihl << 2;
1435
1436                                        mtu = guess_mtu(old_mtu);
1437                                }
1438                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1439                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
1440                                                dst_confirm(&rth->u.dst);
1441                                                if (mtu < ip_rt_min_pmtu) {
1442                                                        mtu = ip_rt_min_pmtu;
1443                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
1444                                                                (1 << RTAX_MTU);
1445                                                }
1446                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1447                                                dst_set_expires(&rth->u.dst,
1448                                                        ip_rt_mtu_expires);
1449                                        }
1450                                        est_mtu = mtu;
1451                                }
1452                        }
1453                }
1454                rcu_read_unlock();
1455        }
1456        return est_mtu ? : new_mtu;
1457}
1458
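/*
 * dst_ops->update_pmtu hook: shrink the path MTU of a single cache
 * entry. Values below ip_rt_min_pmtu are clamped and the MTU metric
 * is locked, so forged ICMPs cannot drive the path MTU down to a
 * useless value; the discovered MTU expires after ip_rt_mtu_expires.
 */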
1459static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1460{
1461        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1462            !(dst_metric_locked(dst, RTAX_MTU))) {
1463                if (mtu < ip_rt_min_pmtu) {
1464                        mtu = ip_rt_min_pmtu;
1465                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1466                }
1467                dst->metrics[RTAX_MTU-1] = mtu;
1468                dst_set_expires(dst, ip_rt_mtu_expires);
1469                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1470        }
1471}
1472
1473static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1474{
1475        return NULL;
1476}
1477
1478static void ipv4_dst_destroy(struct dst_entry *dst)
1479{
1480        struct rtable *rt = (struct rtable *) dst;
1481        struct inet_peer *peer = rt->peer;
1482        struct in_device *idev = rt->idev;
1483
1484        if (peer) {
1485                rt->peer = NULL;
1486                inet_putpeer(peer);
1487        }
1488
1489        if (idev) {
1490                rt->idev = NULL;
1491                in_dev_put(idev);
1492        }
1493}
1494
1495static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1496                            int how)
1497{
1498        struct rtable *rt = (struct rtable *) dst;
1499        struct in_device *idev = rt->idev;
1500        if (dev != &loopback_dev && idev && idev->dev == dev) {
1501                struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1502                if (loopback_idev) {
1503                        rt->idev = loopback_idev;
1504                        in_dev_put(idev);
1505                }
1506        }
1507}
1508
1509static void ipv4_link_failure(struct sk_buff *skb)
1510{
1511        struct rtable *rt;
1512
1513        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1514
1515        rt = (struct rtable *) skb->dst;
1516        if (rt)
1517                dst_set_expires(&rt->u.dst, 0);
1518}
1519
1520static int ip_rt_bug(struct sk_buff *skb)
1521{
1522        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1523                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1524                skb->dev ? skb->dev->name : "?");
1525        kfree_skb(skb);
1526        return 0;
1527}
1528
1529/*
1530   We do not cache the source address of the outgoing interface,
1531   because it is used only by the IP RR, TS and SRR options,
1532   so it is out of the fast path.
1533
1534   Remember: "addr" is allowed to be unaligned
1535   inside IP options!
1536 */
1537
1538void ip_rt_get_source(u8 *addr, struct rtable *rt)
1539{
1540        __be32 src;
1541        struct fib_result res;
1542
1543        if (rt->fl.iif == 0)
1544                src = rt->rt_src;
1545        else if (fib_lookup(&rt->fl, &res) == 0) {
1546                src = FIB_RES_PREFSRC(res);
1547                fib_res_put(&res);
1548        } else
1549                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1550                                        RT_SCOPE_UNIVERSE);
1551        memcpy(addr, &src, 4);
1552}
1553
1554#ifdef CONFIG_NET_CLS_ROUTE
1555static void set_class_tag(struct rtable *rt, u32 tag)
1556{
1557        if (!(rt->u.dst.tclassid & 0xFFFF))
1558                rt->u.dst.tclassid |= tag & 0xFFFF;
1559        if (!(rt->u.dst.tclassid & 0xFFFF0000))
1560                rt->u.dst.tclassid |= tag & 0xFFFF0000;
1561}
1562#endif
1563
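/*
 * Fill in the nexthop-derived fields of a new cache entry: copy the
 * FIB metrics, pick up a link-scope gateway, and compute defaults
 * for anything the FIB left unset. Notably, advmss defaults to the
 * device MTU minus 40 bytes (room for plain IPv4 + TCP headers),
 * bounded below by ip_rt_min_advmss and above by 65535 - 40.
 */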
1564static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1565{
1566        struct fib_info *fi = res->fi;
1567
1568        if (fi) {
1569                if (FIB_RES_GW(*res) &&
1570                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1571                        rt->rt_gateway = FIB_RES_GW(*res);
1572                memcpy(rt->u.dst.metrics, fi->fib_metrics,
1573                       sizeof(rt->u.dst.metrics));
1574                if (fi->fib_mtu == 0) {
1575                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1576                        if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1577                            rt->rt_gateway != rt->rt_dst &&
1578                            rt->u.dst.dev->mtu > 576)
1579                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
1580                }
1581#ifdef CONFIG_NET_CLS_ROUTE
1582                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1583#endif
1584        } else
1585                rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1586
1587        if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1588                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1589        if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1590                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1591        if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1592                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1593                                       ip_rt_min_advmss);
1594        if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1595                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1596
1597#ifdef CONFIG_NET_CLS_ROUTE
1598#ifdef CONFIG_IP_MULTIPLE_TABLES
1599        set_class_tag(rt, fib_rules_tclass(res));
1600#endif
1601        set_class_tag(rt, itag);
1602#endif
1603        rt->rt_type = res->type;
1604}
1605
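/*
 * Slow-path input routing for multicast destinations. Builds a cache
 * entry that delivers locally via ip_local_deliver() when this host
 * is a group member ("our"), and hands non-local groups to the
 * multicast forwarding engine when CONFIG_IP_MROUTE is enabled.
 */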
1606static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1607                                u8 tos, struct net_device *dev, int our)
1608{
1609        unsigned hash;
1610        struct rtable *rth;
1611        __be32 spec_dst;
1612        struct in_device *in_dev = in_dev_get(dev);
1613        u32 itag = 0;
1614
1615        /* Primary sanity checks. */
1616
1617        if (in_dev == NULL)
1618                return -EINVAL;
1619
1620        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1621            skb->protocol != htons(ETH_P_IP))
1622                goto e_inval;
1623
1624        if (ZERONET(saddr)) {
1625                if (!LOCAL_MCAST(daddr))
1626                        goto e_inval;
1627                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1628        } else if (fib_validate_source(saddr, 0, tos, 0,
1629                                        dev, &spec_dst, &itag) < 0)
1630                goto e_inval;
1631
1632        rth = dst_alloc(&ipv4_dst_ops);
1633        if (!rth)
1634                goto e_nobufs;
1635
1636        rth->u.dst.output= ip_rt_bug;
1637
1638        atomic_set(&rth->u.dst.__refcnt, 1);
1639        rth->u.dst.flags= DST_HOST;
1640        if (in_dev->cnf.no_policy)
1641                rth->u.dst.flags |= DST_NOPOLICY;
1642        rth->fl.fl4_dst = daddr;
1643        rth->rt_dst     = daddr;
1644        rth->fl.fl4_tos = tos;
1645        rth->fl.mark    = skb->mark;
1646        rth->fl.fl4_src = saddr;
1647        rth->rt_src     = saddr;
1648#ifdef CONFIG_NET_CLS_ROUTE
1649        rth->u.dst.tclassid = itag;
1650#endif
1651        rth->rt_iif     =
1652        rth->fl.iif     = dev->ifindex;
1653        rth->u.dst.dev  = &loopback_dev;
1654        dev_hold(rth->u.dst.dev);
1655        rth->idev       = in_dev_get(rth->u.dst.dev);
1656        rth->fl.oif     = 0;
1657        rth->rt_gateway = daddr;
1658        rth->rt_spec_dst= spec_dst;
1659        rth->rt_type    = RTN_MULTICAST;
1660        rth->rt_flags   = RTCF_MULTICAST;
1661        if (our) {
1662                rth->u.dst.input= ip_local_deliver;
1663                rth->rt_flags |= RTCF_LOCAL;
1664        }
1665
1666#ifdef CONFIG_IP_MROUTE
1667        if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1668                rth->u.dst.input = ip_mr_input;
1669#endif
1670        RT_CACHE_STAT_INC(in_slow_mc);
1671
1672        in_dev_put(in_dev);
1673        hash = rt_hash(daddr, saddr, dev->ifindex);
1674        return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1675
1676e_nobufs:
1677        in_dev_put(in_dev);
1678        return -ENOBUFS;
1679
1680e_inval:
1681        in_dev_put(in_dev);
1682        return -EINVAL;
1683}
1684
1685
1686static void ip_handle_martian_source(struct net_device *dev,
1687                                     struct in_device *in_dev,
1688                                     struct sk_buff *skb,
1689                                     __be32 daddr,
1690                                     __be32 saddr)
1691{
1692        RT_CACHE_STAT_INC(in_martian_src);
1693#ifdef CONFIG_IP_ROUTE_VERBOSE
1694        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1695                /*
1696                 *      RFC 1812 recommendation: if the source is martian,
1697                 *      the only hint is the MAC header.
1698                 */
1699                printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1700                        "%u.%u.%u.%u, on dev %s\n",
1701                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1702                if (dev->hard_header_len && skb->mac.raw) {
1703                        int i;
1704                        unsigned char *p = skb->mac.raw;
1705                        printk(KERN_WARNING "ll header: ");
1706                        for (i = 0; i < dev->hard_header_len; i++, p++) {
1707                                printk("%02x", *p);
1708                                if (i < (dev->hard_header_len - 1))
1709                                        printk(":");
1710                        }
1711                        printk("\n");
1712                }
1713        }
1714#endif
1715}
1716
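/*
 * Build a single forwarding cache entry for an input route. The
 * source is validated against the FIB (reverse-path check), and
 * RTCF_DOREDIRECT is set when the packet would leave through the
 * interface it arrived on, i.e. the sender should be told about a
 * better first hop.
 */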
1717static inline int __mkroute_input(struct sk_buff *skb, 
1718                                  struct fib_result* res, 
1719                                  struct in_device *in_dev, 
1720                                  __be32 daddr, __be32 saddr, u32 tos,
1721                                  struct rtable **result) 
1722{
1723
1724        struct rtable *rth;
1725        int err;
1726        struct in_device *out_dev;
1727        unsigned flags = 0;
1728        __be32 spec_dst;
1729        u32 itag;
1730
1731        /* get a working reference to the output device */
1732        out_dev = in_dev_get(FIB_RES_DEV(*res));
1733        if (out_dev == NULL) {
1734                if (net_ratelimit())
1735                        printk(KERN_CRIT "Bug in ip_route_input"
1736                               "_slow(). Please report\n");
1737                return -EINVAL;
1738        }
1739
1740
1741        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 
1742                                  in_dev->dev, &spec_dst, &itag);
1743        if (err < 0) {
1744                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 
1745                                         saddr);
1746                
1747                err = -EINVAL;
1748                goto cleanup;
1749        }
1750
1751        if (err)
1752                flags |= RTCF_DIRECTSRC;
1753
1754        if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1755            (IN_DEV_SHARED_MEDIA(out_dev) ||
1756             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1757                flags |= RTCF_DOREDIRECT;
1758
1759        if (skb->protocol != htons(ETH_P_IP)) {
1760                /* Not IP (i.e. ARP). Do not create a route if it is
1761                 * invalid for proxy ARP. DNAT routes are always valid.
1762                 */
1763                if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1764                        err = -EINVAL;
1765                        goto cleanup;
1766                }
1767        }
1768
1769
1770        rth = dst_alloc(&ipv4_dst_ops);
1771        if (!rth) {
1772                err = -ENOBUFS;
1773                goto cleanup;
1774        }
1775
1776        atomic_set(&rth->u.dst.__refcnt, 1);
1777        rth->u.dst.flags= DST_HOST;
1778#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1779        if (res->fi->fib_nhs > 1)
1780                rth->u.dst.flags |= DST_BALANCED;
1781#endif
1782        if (in_dev->cnf.no_policy)
1783                rth->u.dst.flags |= DST_NOPOLICY;
1784        if (out_dev->cnf.no_xfrm)
1785                rth->u.dst.flags |= DST_NOXFRM;
1786        rth->fl.fl4_dst = daddr;
1787        rth->rt_dst     = daddr;
1788        rth->fl.fl4_tos = tos;
1789        rth->fl.mark    = skb->mark;
1790        rth->fl.fl4_src = saddr;
1791        rth->rt_src     = saddr;
1792        rth->rt_gateway = daddr;
1793        rth->rt_iif     =
1794                rth->fl.iif     = in_dev->dev->ifindex;
1795        rth->u.dst.dev  = (out_dev)->dev;
1796        dev_hold(rth->u.dst.dev);
1797        rth->idev       = in_dev_get(rth->u.dst.dev);
1798        rth->fl.oif     = 0;
1799        rth->rt_spec_dst= spec_dst;
1800
1801        rth->u.dst.input = ip_forward;
1802        rth->u.dst.output = ip_output;
1803
1804        rt_set_nexthop(rth, res, itag);
1805
1806        rth->rt_flags = flags;
1807
1808        *result = rth;
1809        err = 0;
1810 cleanup:
1811        /* release the working reference to the output device */
1812        in_dev_put(out_dev);
1813        return err;
1814}
1815
1816static inline int ip_mkroute_input_def(struct sk_buff *skb, 
1817                                       struct fib_result* res, 
1818                                       const struct flowi *fl,
1819                                       struct in_device *in_dev,
1820                                       __be32 daddr, __be32 saddr, u32 tos)
1821{
1822        struct rtable* rth = NULL;
1823        int err;
1824        unsigned hash;
1825
1826#ifdef CONFIG_IP_ROUTE_MULTIPATH
1827        if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1828                fib_select_multipath(fl, res);
1829#endif
1830
1831        /* create a routing cache entry */
1832        err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1833        if (err)
1834                return err;
1835
1836        /* put it into the cache */
1837        hash = rt_hash(daddr, saddr, fl->iif);
1838        return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);   
1839}
1840
1841static inline int ip_mkroute_input(struct sk_buff *skb, 
1842                                   struct fib_result* res, 
1843                                   const struct flowi *fl,
1844                                   struct in_device *in_dev,
1845                                   __be32 daddr, __be32 saddr, u32 tos)
1846{
1847#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1848        struct rtable* rth = NULL, *rtres;
1849        unsigned char hop, hopcount;
1850        int err = -EINVAL;
1851        unsigned int hash;
1852
1853        if (res->fi)
1854                hopcount = res->fi->fib_nhs;
1855        else
1856                hopcount = 1;
1857
1858        /* distinguish between multipath and singlepath */
1859        if (hopcount < 2)
1860                return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1861                                            saddr, tos);
1862        
1863        /* add all alternatives to the routing cache */
1864        for (hop = 0; hop < hopcount; hop++) {
1865                res->nh_sel = hop;
1866
1867                /* put reference to previous result */
1868                if (hop)
1869                        ip_rt_put(rtres);
1870
1871                /* create a routing cache entry */
1872                err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1873                                      &rth);
1874                if (err)
1875                        return err;
1876
1877                /* put it into the cache */
1878                hash = rt_hash(daddr, saddr, fl->iif);
1879                err = rt_intern_hash(hash, rth, &rtres);
1880                if (err)
1881                        return err;
1882
1883                /* forward hop information to multipath impl. */
1884                multipath_set_nhinfo(rth,
1885                                     FIB_RES_NETWORK(*res),
1886                                     FIB_RES_NETMASK(*res),
1887                                     res->prefixlen,
1888                                     &FIB_RES_NH(*res));
1889        }
1890        skb->dst = &rtres->u.dst;
1891        return err;
1892#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1893        return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1894#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1895}
1896
1897
1898/*
1899 *      NOTE. We drop all packets that have a local source
1900 *      address, because every properly looped-back packet
1901 *      must already have the correct destination attached by the output routine.
1902 *
1903 *      This approach solves two big problems:
1904 *      1. Non-simplex devices are handled properly.
1905 *      2. IP spoofing attempts are filtered with a 100% guarantee.
1906 */
1907
1908static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1909                               u8 tos, struct net_device *dev)
1910{
1911        struct fib_result res;
1912        struct in_device *in_dev = in_dev_get(dev);
1913        struct flowi fl = { .nl_u = { .ip4_u =
1914                                      { .daddr = daddr,
1915                                        .saddr = saddr,
1916                                        .tos = tos,
1917                                        .scope = RT_SCOPE_UNIVERSE,
1918                                      } },
1919                            .mark = skb->mark,
1920                            .iif = dev->ifindex };
1921        unsigned        flags = 0;
1922        u32             itag = 0;
1923        struct rtable * rth;
1924        unsigned        hash;
1925        __be32          spec_dst;
1926        int             err = -EINVAL;
1927        int             free_res = 0;
1928
1929        /* IP on this device is disabled. */
1930
1931        if (!in_dev)
1932                goto out;
1933
1934        /* Check for the most weird martians, which cannot be detected
1935           by fib_lookup.
1936         */
1937
1938        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1939                goto martian_source;
1940
1941        if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1942                goto brd_input;
1943
1944        /* Accept zero addresses only for limited broadcast;
1945         * I do not even know whether to fix this or not. Waiting for complaints :-)
1946         */
1947        if (ZERONET(saddr))
1948                goto martian_source;
1949
1950        if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1951                goto martian_destination;
1952
1953        /*
1954         *      Now we are ready to route the packet.
1955         */
1956        if ((err = fib_lookup(&fl, &res)) != 0) {
1957                if (!IN_DEV_FORWARD(in_dev))
1958                        goto e_hostunreach;
1959                goto no_route;
1960        }
1961        free_res = 1;
1962
1963        RT_CACHE_STAT_INC(in_slow_tot);
1964
1965        if (res.type == RTN_BROADCAST)
1966                goto brd_input;
1967
1968        if (res.type == RTN_LOCAL) {
1969                int result;
1970                result = fib_validate_source(saddr, daddr, tos,
1971                                             loopback_dev.ifindex,
1972                                             dev, &spec_dst, &itag);
1973                if (result < 0)
1974                        goto martian_source;
1975                if (result)
1976                        flags |= RTCF_DIRECTSRC;
1977                spec_dst = daddr;
1978                goto local_input;
1979        }
1980
1981        if (!IN_DEV_FORWARD(in_dev))
1982                goto e_hostunreach;
1983        if (res.type != RTN_UNICAST)
1984                goto martian_destination;
1985
1986        err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1987        if (err == -ENOBUFS)
1988                goto e_nobufs;
1989        if (err == -EINVAL)
1990                goto e_inval;
1991        
1992done:
1993        in_dev_put(in_dev);
1994        if (free_res)
1995                fib_res_put(&res);
1996out:    return err;
1997
1998brd_input:
1999        if (skb->protocol != htons(ETH_P_IP))
2000                goto e_inval;
2001
2002        if (ZERONET(saddr))
2003                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2004        else {
2005                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2006                                          &itag);
2007                if (err < 0)
2008                        goto martian_source;
2009                if (err)
2010                        flags |= RTCF_DIRECTSRC;
2011        }
2012        flags |= RTCF_BROADCAST;
2013        res.type = RTN_BROADCAST;
2014        RT_CACHE_STAT_INC(in_brd);
2015
2016local_input:
2017        rth = dst_alloc(&ipv4_dst_ops);
2018        if (!rth)
2019                goto e_nobufs;
2020
2021        rth->u.dst.output= ip_rt_bug;
2022
2023        atomic_set(&rth->u.dst.__refcnt, 1);
2024        rth->u.dst.flags= DST_HOST;
2025        if (in_dev->cnf.no_policy)
2026                rth->u.dst.flags |= DST_NOPOLICY;
2027        rth->fl.fl4_dst = daddr;
2028        rth->rt_dst     = daddr;
2029        rth->fl.fl4_tos = tos;
2030        rth->fl.mark    = skb->mark;
2031        rth->fl.fl4_src = saddr;
2032        rth->rt_src     = saddr;
2033#ifdef CONFIG_NET_CLS_ROUTE
2034        rth->u.dst.tclassid = itag;
2035#endif
2036        rth->rt_iif     =
2037        rth->fl.iif     = dev->ifindex;
2038        rth->u.dst.dev  = &loopback_dev;
2039        dev_hold(rth->u.dst.dev);
2040        rth->idev       = in_dev_get(rth->u.dst.dev);
2041        rth->rt_gateway = daddr;
2042        rth->rt_spec_dst= spec_dst;
2043        rth->u.dst.input= ip_local_deliver;
2044        rth->rt_flags   = flags|RTCF_LOCAL;
2045        if (res.type == RTN_UNREACHABLE) {
2046                rth->u.dst.input= ip_error;
2047                rth->u.dst.error= -err;
2048                rth->rt_flags   &= ~RTCF_LOCAL;
2049        }
2050        rth->rt_type    = res.type;
2051        hash = rt_hash(daddr, saddr, fl.iif);
2052        err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2053        goto done;
2054
2055no_route:
2056        RT_CACHE_STAT_INC(in_no_route);
2057        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2058        res.type = RTN_UNREACHABLE;
2059        goto local_input;
2060
2061        /*
2062         *      Do not cache martian addresses: they should be logged (RFC1812)
2063         */
2064martian_destination:
2065        RT_CACHE_STAT_INC(in_martian_dst);
2066#ifdef CONFIG_IP_ROUTE_VERBOSE
2067        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2068                printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2069                        "%u.%u.%u.%u, dev %s\n",
2070                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2071#endif
2072
2073e_hostunreach:
2074        err = -EHOSTUNREACH;
2075        goto done;
2076
2077e_inval:
2078        err = -EINVAL;
2079        goto done;
2080
2081e_nobufs:
2082        err = -ENOBUFS;
2083        goto done;
2084
2085martian_source:
2086        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2087        goto e_inval;
2088}
2089
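/*
 * Input routing fast path: hash (daddr, saddr, iif) and walk the
 * bucket under RCU, comparing the complete flow key so that entries
 * for different marks or TOS values never alias. A hit bumps the
 * use counters and attaches the cached dst to the skb; multicast is
 * special-cased below, and everything else takes the slow path.
 */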
2090int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091                   u8 tos, struct net_device *dev)
2092{
2093        struct rtable * rth;
2094        unsigned        hash;
2095        int iif = dev->ifindex;
2096
2097        tos &= IPTOS_RT_MASK;
2098        hash = rt_hash(daddr, saddr, iif);
2099
2100        rcu_read_lock();
2101        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2102             rth = rcu_dereference(rth->u.rt_next)) {
2103                if (rth->fl.fl4_dst == daddr &&
2104                    rth->fl.fl4_src == saddr &&
2105                    rth->fl.iif == iif &&
2106                    rth->fl.oif == 0 &&
2107                    rth->fl.mark == skb->mark &&
2108                    rth->fl.fl4_tos == tos) {
2109                        rth->u.dst.lastuse = jiffies;
2110                        dst_hold(&rth->u.dst);
2111                        rth->u.dst.__use++;
2112                        RT_CACHE_STAT_INC(in_hit);
2113                        rcu_read_unlock();
2114                        skb->dst = (struct dst_entry*)rth;
2115                        return 0;
2116                }
2117                RT_CACHE_STAT_INC(in_hlist_search);
2118        }
2119        rcu_read_unlock();
2120
2121        /* Multicast recognition logic is moved from the route cache to here.
2122           The problem was that too many Ethernet cards have broken/missing
2123           hardware multicast filters :-( As a result, a host on a multicast
2124           network acquires a lot of useless route cache entries, a sort of
2125           SDR messages from all over the world. Now we try to get rid of them.
2126           Really, provided the software IP multicast filter is organized
2127           reasonably (at least, hashed), it does not cause a slowdown
2128           compared with route cache reject entries.
2129           Note that multicast routers are not affected, because
2130           a route cache entry is created eventually.
2131         */
2132        if (MULTICAST(daddr)) {
2133                struct in_device *in_dev;
2134
2135                rcu_read_lock();
2136                if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2137                        int our = ip_check_mc(in_dev, daddr, saddr,
2138                                skb->nh.iph->protocol);
2139                        if (our
2140#ifdef CONFIG_IP_MROUTE
2141                            || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2142#endif
2143                            ) {
2144                                rcu_read_unlock();
2145                                return ip_route_input_mc(skb, daddr, saddr,
2146                                                         tos, dev, our);
2147                        }
2148                }
2149                rcu_read_unlock();
2150                return -EINVAL;
2151        }
2152        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2153}
2154
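/*
 * Build a single cache entry for an output route. The destination
 * class determines the route type (broadcast, multicast or unicast),
 * and RTCF_LOCAL entries are wired to ip_local_deliver() so locally
 * destined output is looped back instead of hitting the wire.
 */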
2155static inline int __mkroute_output(struct rtable **result,
2156                                   struct fib_result* res, 
2157                                   const struct flowi *fl,
2158                                   const struct flowi *oldflp, 
2159                                   struct net_device *dev_out, 
2160                                   unsigned flags) 
2161{
2162        struct rtable *rth;
2163        struct in_device *in_dev;
2164        u32 tos = RT_FL_TOS(oldflp);
2165        int err = 0;
2166
2167        if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2168                return -EINVAL;
2169
2170        if (fl->fl4_dst == htonl(0xFFFFFFFF))
2171                res->type = RTN_BROADCAST;
2172        else if (MULTICAST(fl->fl4_dst))
2173                res->type = RTN_MULTICAST;
2174        else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2175                return -EINVAL;
2176
2177        if (dev_out->flags & IFF_LOOPBACK)
2178                flags |= RTCF_LOCAL;
2179
2180        /* get a working reference to the inet device */
2181        in_dev = in_dev_get(dev_out);
2182        if (!in_dev)
2183                return -EINVAL;
2184
2185        if (res->type == RTN_BROADCAST) {
2186                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2187                if (res->fi) {
2188                        fib_info_put(res->fi);
2189                        res->fi = NULL;
2190                }
2191        } else if (res->type == RTN_MULTICAST) {
2192                flags |= RTCF_MULTICAST|RTCF_LOCAL;
2193                if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 
2194                                 oldflp->proto))
2195                        flags &= ~RTCF_LOCAL;
2196                /* If a multicast route does not exist, use
2197                   the default one, but do not gateway in this case.
2198                   Yes, it is a hack.
2199                 */
2200                if (res->fi && res->prefixlen < 4) {
2201                        fib_info_put(res->fi);
2202                        res->fi = NULL;
2203                }
2204        }
2205
2206
2207        rth = dst_alloc(&ipv4_dst_ops);
2208        if (!rth) {
2209                err = -ENOBUFS;
2210                goto cleanup;
2211        }               
2212
2213        atomic_set(&rth->u.dst.__refcnt, 1);
2214        rth->u.dst.flags= DST_HOST;
2215#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2216        if (res->fi) {
2217                rth->rt_multipath_alg = res->fi->fib_mp_alg;
2218                if (res->fi->fib_nhs > 1)
2219                        rth->u.dst.flags |= DST_BALANCED;
2220        }
2221#endif
2222        if (in_dev->cnf.no_xfrm)
2223                rth->u.dst.flags |= DST_NOXFRM;
2224        if (in_dev->cnf.no_policy)
2225                rth->u.dst.flags |= DST_NOPOLICY;
2226
2227        rth->fl.fl4_dst = oldflp->fl4_dst;
2228        rth->fl.fl4_tos = tos;
2229        rth->fl.fl4_src = oldflp->fl4_src;
2230        rth->fl.oif     = oldflp->oif;
2231        rth->fl.mark    = oldflp->mark;
2232        rth->rt_dst     = fl->fl4_dst;
2233        rth->rt_src     = fl->fl4_src;
2234        rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2235        /* get references to the devices that are to be held by the routing
2236           cache entry */
2237        rth->u.dst.dev  = dev_out;
2238        dev_hold(dev_out);
2239        rth->idev       = in_dev_get(dev_out);
2240        rth->rt_gateway = fl->fl4_dst;
2241        rth->rt_spec_dst= fl->fl4_src;
2242
2243        rth->u.dst.output=ip_output;
2244
2245        RT_CACHE_STAT_INC(out_slow_tot);
2246
2247        if (flags & RTCF_LOCAL) {
2248                rth->u.dst.input = ip_local_deliver;
2249                rth->rt_spec_dst = fl->fl4_dst;
2250        }
2251        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2252                rth->rt_spec_dst = fl->fl4_src;
2253                if (flags & RTCF_LOCAL && 
2254                    !(dev_out->flags & IFF_LOOPBACK)) {
2255                        rth->u.dst.output = ip_mc_output;
2256                        RT_CACHE_STAT_INC(out_slow_mc);
2257                }
2258#ifdef CONFIG_IP_MROUTE
2259                if (res->type == RTN_MULTICAST) {
2260                        if (IN_DEV_MFORWARD(in_dev) &&
2261                            !LOCAL_MCAST(oldflp->fl4_dst)) {
2262                                rth->u.dst.input = ip_mr_input;
2263                                rth->u.dst.output = ip_mc_output;
2264                        }
2265                }
2266#endif
2267        }
2268
2269        rt_set_nexthop(rth, res, 0);
2270
2271        rth->rt_flags = flags;
2272
2273        *result = rth;
2274 cleanup:
2275        /* release the working reference to the inet device */
2276        in_dev_put(in_dev);
2277
2278        return err;
2279}
2280
2281static inline int ip_mkroute_output_def(struct rtable **rp,
2282                                        struct fib_result* res,
2283                                        const struct flowi *fl,
2284                                        const struct flowi *oldflp,
2285                                        struct net_device *dev_out,
2286                                        unsigned flags)
2287{
2288        struct rtable *rth = NULL;
2289        int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2290        unsigned hash;
2291        if (err == 0) {
2292                hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2293                err = rt_intern_hash(hash, rth, rp);
2294        }
2295        
2296        return err;
2297}
2298
2299static inline int ip_mkroute_output(struct rtable** rp,
2300                                    struct fib_result* res,
2301                                    const struct flowi *fl,
2302                                    const struct flowi *oldflp,
2303                                    struct net_device *dev_out,
2304                                    unsigned flags)
2305{
2306#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2307        unsigned char hop;
2308        unsigned hash;
2309        int err = -EINVAL;
2310        struct rtable *rth = NULL;
2311
2312        if (res->fi && res->fi->fib_nhs > 1) {
2313                unsigned char hopcount = res->fi->fib_nhs;
2314
2315                for (hop = 0; hop < hopcount; hop++) {
2316                        struct net_device *dev2nexthop;
2317
2318                        res->nh_sel = hop;
2319
2320                        /* hold a working reference to the output device */
2321                        dev2nexthop = FIB_RES_DEV(*res);
2322                        dev_hold(dev2nexthop);
2323
2324                        /* put reference to previous result */
2325                        if (hop)
2326                                ip_rt_put(*rp);
2327
2328                        err = __mkroute_output(&rth, res, fl, oldflp,
2329                                               dev2nexthop, flags);
2330
2331                        if (err != 0)
2332                                goto cleanup;
2333
2334                        hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
2335                                        oldflp->oif);
2336                        err = rt_intern_hash(hash, rth, rp);
2337
2338                        /* forward hop information to multipath impl. */
2339                        multipath_set_nhinfo(rth,
2340                                             FIB_RES_NETWORK(*res),
2341                                             FIB_RES_NETMASK(*res),
2342                                             res->prefixlen,
2343                                             &FIB_RES_NH(*res));
2344                cleanup:
2345                        /* release the working reference to the output device */
2346                        dev_put(dev2nexthop);
2347
2348                        if (err != 0)
2349                                return err;
2350                }
2351                return err;
2352        } else {
2353                return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2354                                             flags);
2355        }
2356#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2357        return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2358#endif
2359}
2360
2361/*
2362 * Major route resolver routine.
2363 */
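/*
 * Roughly: validate any requested source address, resolve the output
 * device (explicit oif, the device owning the source address, or
 * loopback when no destination is given), then consult the FIB. A
 * failed lookup with an explicit oif falls back to an on-link
 * assumption; local routes are rewritten to the loopback device
 * before the cache entry is built in ip_mkroute_output().
 */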
2364
2365static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2366{
2367        u32 tos = RT_FL_TOS(oldflp);
2368        struct flowi fl = { .nl_u = { .ip4_u =
2369                                      { .daddr = oldflp->fl4_dst,
2370                                        .saddr = oldflp->fl4_src,
2371                                        .tos = tos & IPTOS_RT_MASK,
2372                                        .scope = ((tos & RTO_ONLINK) ?
2373                                                  RT_SCOPE_LINK :
2374                                                  RT_SCOPE_UNIVERSE),
2375                                      } },
2376                            .mark = oldflp->mark,
2377                            .iif = loopback_dev.ifindex,
2378                            .oif = oldflp->oif };
2379        struct fib_result res;
2380        unsigned flags = 0;
2381        struct net_device *dev_out = NULL;
2382        int free_res = 0;
2383        int err;
2384
2385
2386        res.fi          = NULL;
2387#ifdef CONFIG_IP_MULTIPLE_TABLES
2388        res.r           = NULL;
2389#endif
2390
2391        if (oldflp->fl4_src) {
2392                err = -EINVAL;
2393                if (MULTICAST(oldflp->fl4_src) ||
2394                    BADCLASS(oldflp->fl4_src) ||
2395                    ZERONET(oldflp->fl4_src))
2396                        goto out;
2397
2398                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399                dev_out = ip_dev_find(oldflp->fl4_src);
2400                if (dev_out == NULL)
2401                        goto out;
2402
2403                /* I removed check for oif == dev_out->oif here.
2404                   It was wrong for two reasons:
2405                   1. ip_dev_find(saddr) can return wrong iface, if saddr is
2406                      assigned to multiple interfaces.
2407                   2. Moreover, we are allowed to send packets with saddr
2408                      of another iface. --ANK
2409                 */
2410
2411                if (oldflp->oif == 0
2412                    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2413                        /* Special hack: the user can direct multicasts
2414                           and limited broadcast via the necessary interface
2415                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416                           This hack is not just for fun, it allows
2417                           vic, vat and friends to work.
2418                           They bind a socket to loopback, set ttl to zero
2419                           and expect that it will work.
2420                           From the viewpoint of the routing cache they are broken,
2421                           because we are not allowed to build a multicast path
2422                           with a loopback source addr (the routing cache
2423                           cannot know that ttl is zero, so the packet
2424                           will not leave this host and the route is valid).
2425                           Luckily, this hack is a good workaround.
2426                         */
2427
2428                        fl.oif = dev_out->ifindex;
2429                        goto make_route;
2430                }
2431                if (dev_out)
2432                        dev_put(dev_out);
2433                dev_out = NULL;
2434        }
2435
2436
2437        if (oldflp->oif) {
2438                dev_out = dev_get_by_index(oldflp->oif);
2439                err = -ENODEV;
2440                if (dev_out == NULL)
2441                        goto out;
2442
2443                /* RACE: Check return value of inet_select_addr instead. */
2444                if (__in_dev_get_rtnl(dev_out) == NULL) {
2445                        dev_put(dev_out);
2446                        goto out;       /* Wrong error code */
2447                }
2448
2449                if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2450                        if (!fl.fl4_src)
2451                                fl.fl4_src = inet_select_addr(dev_out, 0,
2452                                                              RT_SCOPE_LINK);
2453                        goto make_route;
2454                }
2455                if (!fl.fl4_src) {
2456                        if (MULTICAST(oldflp->fl4_dst))
2457                                fl.fl4_src = inet_select_addr(dev_out, 0,
2458                                                              fl.fl4_scope);
2459                        else if (!oldflp->fl4_dst)
2460                                fl.fl4_src = inet_select_addr(dev_out, 0,
2461                                                              RT_SCOPE_HOST);
2462                }
2463        }
2464
2465        if (!fl.fl4_dst) {
2466                fl.fl4_dst = fl.fl4_src;
2467                if (!fl.fl4_dst)
2468                        fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2469                if (dev_out)
2470                        dev_put(dev_out);
2471                dev_out = &loopback_dev;
2472                dev_hold(dev_out);
2473                fl.oif = loopback_dev.ifindex;
2474                res.type = RTN_LOCAL;
2475                flags |= RTCF_LOCAL;
2476                goto make_route;
2477        }
2478
2479        if (fib_lookup(&fl, &res)) {
2480                res.fi = NULL;
2481                if (oldflp->oif) {
2482                        /* Apparently, the routing tables are wrong. Assume
2483                           that the destination is on link.
2484
2485                           WHY? DW.
2486                           Because we are allowed to send to an iface
2487                           even if it has NO routes and NO assigned
2488                           addresses. When oif is specified, the routing
2489                           tables are looked up with only one purpose:
2490                           to catch whether the destination is gatewayed,
2491                           rather than direct. Moreover, if MSG_DONTROUTE is set,
2492                           we send the packet, ignoring both the routing tables
2493                           and the ifaddr state. --ANK
2494
2495
2496                           We could do this even if oif is unknown,
2497                           likely as IPv6 does, but we do not.
2498                         */
2499
2500                        if (fl.fl4_src == 0)
2501                                fl.fl4_src = inet_select_addr(dev_out, 0,
2502                                                              RT_SCOPE_LINK);
2503                        res.type = RTN_UNICAST;
2504                        goto make_route;
2505                }
2506                if (dev_out)
2507                        dev_put(dev_out);
2508                err = -ENETUNREACH;
2509                goto out;
2510        }
2511        free_res = 1;
2512
2513        if (res.type == RTN_LOCAL) {
2514                if (!fl.fl4_src)
2515                        fl.fl4_src = fl.fl4_dst;
2516                if (dev_out)
2517                        dev_put(dev_out);
2518                dev_out = &loopback_dev;
2519                dev_hold(dev_out);
2520                fl.oif = dev_out->ifindex;
2521                if (res.fi)
2522                        fib_info_put(res.fi);
2523                res.fi = NULL;
2524                flags |= RTCF_LOCAL;
2525                goto make_route;
2526        }
2527
2528#ifdef CONFIG_IP_ROUTE_MULTIPATH
2529        if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530                fib_select_multipath(&fl, &res);
2531        else
2532#endif
2533        if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534                fib_select_default(&fl, &res);
2535
2536        if (!fl.fl4_src)
2537                fl.fl4_src = FIB_RES_PREFSRC(res);
2538
2539        if (dev_out)
2540                dev_put(dev_out);
2541        dev_out = FIB_RES_DEV(res);
2542        dev_hold(dev_out);
2543        fl.oif = dev_out->ifindex;
2544
2545
2546make_route:
2547        err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2548
2549
2550        if (free_res)
2551                fib_res_put(&res);
2552        if (dev_out)
2553                dev_put(dev_out);
2554out:    return err;
2555}
2556
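/*
 * Output routing fast path. The lookup runs under rcu_read_lock_bh()
 * because output routes are resolved from process context while the
 * hash chains are also modified from softirqs. Note that TOS is
 * compared with the RTO_ONLINK bit included, so on-link and
 * gatewayed requests for the same destination use distinct entries.
 */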
2557int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2558{
2559        unsigned hash;
2560        struct rtable *rth;
2561
2562        hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2563
2564        rcu_read_lock_bh();
2565        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566                rth = rcu_dereference(rth->u.rt_next)) {
2567                if (rth->fl.fl4_dst == flp->fl4_dst &&
2568                    rth->fl.fl4_src == flp->fl4_src &&
2569                    rth->fl.iif == 0 &&
2570                    rth->fl.oif == flp->oif &&
2571                    rth->fl.mark == flp->mark &&
2572                    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2573                            (IPTOS_RT_MASK | RTO_ONLINK))) {
2574
2575                        /* check for multipath routes and choose one if
2576                         * necessary
2577                         */
2578                        if (multipath_select_route(flp, rth, rp)) {
2579                                dst_hold(&(*rp)->u.dst);
2580                                RT_CACHE_STAT_INC(out_hit);
2581                                rcu_read_unlock_bh();
2582                                return 0;
2583                        }
2584
2585                        rth->u.dst.lastuse = jiffies;
2586                        dst_hold(&rth->u.dst);
2587                        rth->u.dst.__use++;
2588                        RT_CACHE_STAT_INC(out_hit);
2589                        rcu_read_unlock_bh();
2590                        *rp = rth;
2591                        return 0;
2592                }
2593                RT_CACHE_STAT_INC(out_hlist_search);
2594        }
2595        rcu_read_unlock_bh();
2596
2597        return ip_route_output_slow(rp, flp);
2598}
2599
2600EXPORT_SYMBOL_GPL(__ip_route_output_key);
2601
2602int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2603{
2604        int err;
2605
2606        if ((err = __ip_route_output_key(rp, flp)) != 0)
2607                return err;
2608
2609        if (flp->proto) {
2610                if (!flp->fl4_src)
2611                        flp->fl4_src = (*rp)->rt_src;
2612                if (!flp->fl4_dst)
2613                        flp->fl4_dst = (*rp)->rt_dst;
2614                return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2615        }
2616
2617        return 0;
2618}
2619
2620EXPORT_SYMBOL_GPL(ip_route_output_flow);
2621
2622int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2623{
2624        return ip_route_output_flow(rp, flp, NULL, 0);
2625}
2626
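/*
 * Encode one cached route as an RTM_NEWROUTE netlink message: type,
 * scope and addresses from the rtable, metrics via
 * rtnetlink_put_metrics(), and expiry/error details in the trailing
 * cacheinfo attribute.
 */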
2627static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2628                        int nowait, unsigned int flags)
2629{
2630        struct rtable *rt = (struct rtable*)skb->dst;
2631        struct rtmsg *r;
2632        struct nlmsghdr *nlh;
2633        long expires;
2634        u32 id = 0, ts = 0, tsage = 0, error;
2635
2636        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2637        if (nlh == NULL)
2638                return -ENOBUFS;
2639
2640        r = nlmsg_data(nlh);
2641        r->rtm_family    = AF_INET;
2642        r->rtm_dst_len  = 32;
2643        r->rtm_src_len  = 0;
2644        r->rtm_tos      = rt->fl.fl4_tos;
2645        r->rtm_table    = RT_TABLE_MAIN;
2646        NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2647        r->rtm_type     = rt->rt_type;
2648        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2649        r->rtm_protocol = RTPROT_UNSPEC;
2650        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651        if (rt->rt_flags & RTCF_NOTIFY)
2652                r->rtm_flags |= RTM_F_NOTIFY;
2653
2654        NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2655
2656        if (rt->fl.fl4_src) {
2657                r->rtm_src_len = 32;
2658                NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2659        }
2660        if (rt->u.dst.dev)
2661                NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2662#ifdef CONFIG_NET_CLS_ROUTE
2663        if (rt->u.dst.tclassid)
2664                NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2665#endif
2666#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2667        if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2668                NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2669#endif
2670        if (rt->fl.iif)
2671                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2672        else if (rt->rt_src != rt->fl.fl4_src)
2673                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2674
2675        if (rt->rt_dst != rt->rt_gateway)
2676                NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2677
2678        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2679                goto nla_put_failure;
2680
2681        error = rt->u.dst.error;
2682        expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2683        if (rt->peer) {
2684                id = rt->peer->ip_id_count;
2685                if (rt->peer->tcp_ts_stamp) {
2686                        ts = rt->peer->tcp_ts;
2687                        tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2688                }
2689        }
2690
2691        if (rt->fl.iif) {
2692#ifdef CONFIG_IP_MROUTE
2693                __be32 dst = rt->rt_dst;
2694
2695                if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2696                    ipv4_devconf.mc_forwarding) {
2697                        int err = ipmr_get_route(skb, r, nowait);
2698                        if (err <= 0) {
2699                                if (!nowait) {
2700                                        if (err == 0)
2701                                                return 0;
2702                                        goto nla_put_failure;
2703                                } else {
2704                                        if (err == -EMSGSIZE)
2705                                                goto nla_put_failure;
2706                                        error = err;
2707                                }
2708                        }
2709                } else
2710#endif
2711                        NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2712        }
2713
2714        if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2715                               expires, error) < 0)
2716                goto nla_put_failure;
2717
2718        return nlmsg_end(skb, nlh);
2719
2720nla_put_failure:
2721        return nlmsg_cancel(skb, nlh);
2722}
2723
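/*
 * RTM_GETROUTE handler: resolve a single route on behalf of user
 * space. When RTA_IIF is given, the request is treated as if a
 * packet had arrived on that interface and goes through
 * ip_route_input(); otherwise an output lookup is performed. The
 * result is returned to the caller via rt_fill_info().
 */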
2724int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2725{
2726        struct rtmsg *rtm;
2727        struct nlattr *tb[RTA_MAX+1];
2728        struct rtable *rt = NULL;
2729        __be32 dst = 0;
2730        __be32 src = 0;
2731        u32 iif;
2732        int err;
2733        struct sk_buff *skb;
2734
2735        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2736        if (err < 0)
2737                goto errout;
2738
2739        rtm = nlmsg_data(nlh);
2740
2741        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2742        if (skb == NULL) {
2743                err = -ENOBUFS;
2744                goto errout;
2745        }
2746
2747        /* Reserve room for dummy headers; this skb can pass
2748           through a good chunk of the routing engine.
2749         */
2750        skb->mac.raw = skb->nh.raw = skb->data;
2751
2752        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2753        skb->nh.iph->protocol = IPPROTO_ICMP;
2754        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2755
2756        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2757        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2758        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2759
2760        if (iif) {
2761                struct net_device *dev;
2762
2763                dev = __dev_get_by_index(iif);
2764                if (dev == NULL) {
2765                        err = -ENODEV;
2766                        goto errout_free;
2767                }
2768
2769                skb->protocol   = htons(ETH_P_IP);
2770                skb->dev        = dev;
2771                local_bh_disable();
2772                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2773                local_bh_enable();
2774
2775                rt = (struct rtable*) skb->dst;
2776                if (err == 0 && rt->u.dst.error)
2777                        err = -rt->u.dst.error;
2778        } else {
2779                struct flowi fl = {
2780                        .nl_u = {
2781                                .ip4_u = {
2782                                        .daddr = dst,
2783                                        .saddr = src,
2784                                        .tos = rtm->rtm_tos,
2785                                },
2786                        },
2787                        .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2788                };
2789                err = ip_route_output_key(&rt, &fl);
2790        }
2791
2792        if (err)
2793                goto errout_free;
2794
2795        skb->dst = &rt->u.dst;
2796        if (rtm->rtm_flags & RTM_F_NOTIFY)
2797                rt->rt_flags |= RTCF_NOTIFY;
2798
2799        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2800                                RTM_NEWROUTE, 0, 0);
2801        if (err <= 0)
2802                goto errout_free;
2803
2804        err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2805errout:
2806        return err;
2807
2808errout_free:
2809        kfree_skb(skb);
2810        goto errout;
2811}
2812
2813int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2814{
2815        struct rtable *rt;
2816        int h, s_h;
2817        int idx, s_idx;
2818
2819        s_h = cb->args[0];
2820        s_idx = idx = cb->args[1];
2821        for (h = 0; h <= rt_hash_mask; h++) {
2822                if (h < s_h) continue;
2823                if (h > s_h)
2824                        s_idx = 0;
2825                rcu_read_lock_bh();
2826                for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827                     rt = rcu_dereference(rt->u.rt_next), idx++) {
2828                        if (idx < s_idx)
2829                                continue;
2830                        skb->dst = dst_clone(&rt->u.dst);
2831                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2832                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE, 
2833                                         1, NLM_F_MULTI) <= 0) {
2834                                dst_release(xchg(&skb->dst, NULL));
2835                                rcu_read_unlock_bh();
2836                                goto done;
2837                        }
2838                        dst_release(xchg(&skb->dst, NULL));
2839                }
2840                rcu_read_unlock_bh();
2841        }
2842
2843done:
2844        cb->args[0] = h;
2845        cb->args[1] = idx;
2846        return skb->len;
2847}
2848
2849void ip_rt_multicast_event(struct in_device *in_dev)
2850{
2851        rt_cache_flush(0);
2852}
2853
2854#ifdef CONFIG_SYSCTL
2855static int flush_delay;
2856
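/*
 * Handler for the write-only "flush" sysctl. The value written is
 * passed to rt_cache_flush() as the flush delay, so, assuming the
 * usual proc mount,
 *
 *      echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the routing cache immediately.
 */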
2857static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858                                        struct file *filp, void __user *buffer,
2859                                        size_t *lenp, loff_t *ppos)
2860{
2861        if (write) {
2862                proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863                rt_cache_flush(flush_delay);
2864                return 0;
2865        } 
2866
2867        return -EINVAL;
2868}
2869
2870static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2871                                                int __user *name,
2872                                                int nlen,
2873                                                void __user *oldval,
2874                                                size_t __user *oldlenp,
2875                                                void __user *newval,
2876                                                size_t newlen)
2877{
2878        int delay;
2879        if (newlen != sizeof(int))
2880                return -EINVAL;
2881        if (get_user(delay, (int __user *)newval))
2882                return -EFAULT; 
2883        rt_cache_flush(delay); 
2884        return 0;
2885}
2886
2887ctl_table ipv4_route_table[] = {
2888        {
2889                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2890                .procname       = "flush",
2891                .data           = &flush_delay,
2892                .maxlen         = sizeof(int),
2893                .mode           = 0200,
2894                .proc_handler   = &ipv4_sysctl_rtcache_flush,
2895                .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2896        },
2897        {
2898                .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2899                .procname       = "min_delay",
2900                .data           = &ip_rt_min_delay,
2901                .maxlen         = sizeof(int),
2902                .mode           = 0644,
2903                .proc_handler   = &proc_dointvec_jiffies,
2904                .strategy       = &sysctl_jiffies,
2905        },
2906        {
2907                .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2908                .procname       = "max_delay",
2909                .data           = &ip_rt_max_delay,
2910                .maxlen         = sizeof(int),
2911                .mode           = 0644,
2912                .proc_handler   = &proc_dointvec_jiffies,
2913                .strategy       = &sysctl_jiffies,
2914        },
2915        {
2916                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2917                .procname       = "gc_thresh",
2918                .data           = &ipv4_dst_ops.gc_thresh,
2919                .maxlen         = sizeof(int),
2920                .mode           = 0644,
2921                .proc_handler   = &proc_dointvec,
2922        },
2923        {
2924                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2925                .procname       = "max_size",
2926                .data           = &ip_rt_max_size,
2927                .maxlen         = sizeof(int),
2928                .mode           = 0644,
2929                .proc_handler   = &proc_dointvec,
2930        },
2931        {
2932                /*  Deprecated. Use gc_min_interval_ms */
2933 
2934                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2935                .procname       = "gc_min_interval",
2936                .data           = &ip_rt_gc_min_interval,
2937                .maxlen         = sizeof(int),
2938                .mode           = 0644,
2939                .proc_handler   = &proc_dointvec_jiffies,
2940                .strategy       = &sysctl_jiffies,
2941        },
2942        {
2943                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2944                .procname       = "gc_min_interval_ms",
2945                .data           = &ip_rt_gc_min_interval,
2946                .maxlen         = sizeof(int),
2947                .mode           = 0644,
2948                .proc_handler   = &proc_dointvec_ms_jiffies,
2949                .strategy       = &sysctl_ms_jiffies,
2950        },
2951        {
2952                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2953                .procname       = "gc_timeout",
2954                .data           = &ip_rt_gc_timeout,
2955                .maxlen         = sizeof(int),
2956                .mode           = 0644,
2957                .proc_handler   = &proc_dointvec_jiffies,
2958                .strategy       = &sysctl_jiffies,
2959        },
2960        {
2961                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2962                .procname       = "gc_interval",
2963                .data           = &ip_rt_gc_interval,
2964                .maxlen         = sizeof(int),
2965                .mode           = 0644,
2966                .proc_handler   = &proc_dointvec_jiffies,
2967                .strategy       = &sysctl_jiffies,
2968        },
2969        {
2970                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2971                .procname       = "redirect_load",
2972                .data           = &ip_rt_redirect_load,
2973                .maxlen         = sizeof(int),
2974                .mode           = 0644,
2975                .proc_handler   = &proc_dointvec,
2976        },
2977        {
2978                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2979                .procname       = "redirect_number",
2980                .data           = &ip_rt_redirect_number,
2981                .maxlen         = sizeof(int),
2982                .mode           = 0644,
2983                .proc_handler   = &proc_dointvec,
2984        },
2985        {
2986                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2987                .procname       = "redirect_silence",
2988                .data           = &ip_rt_redirect_silence,
2989                .maxlen         = sizeof(int),
2990                .mode           = 0644,
2991                .proc_handler   = &proc_dointvec,
2992        },
2993        {
2994                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2995                .procname       = "error_cost",
2996                .data           = &ip_rt_error_cost,
2997                .maxlen         = sizeof(int),
2998                .mode           = 0644,
2999                .proc_handler   = &proc_dointvec,
3000        },
3001        {
3002                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3003                .procname       = "error_burst",
3004                .data           = &ip_rt_error_burst,
3005                .maxlen         = sizeof(int),
3006                .mode           = 0644,
3007                .proc_handler   = &proc_dointvec,
3008        },
3009        {
3010                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3011                .procname       = "gc_elasticity",
3012                .data           = &ip_rt_gc_elasticity,
3013                .maxlen         = sizeof(int),
3014                .mode           = 0644,
3015                .proc_handler   = &proc_dointvec,
3016        },
3017        {
3018                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3019                .procname       = "mtu_expires",
3020                .data           = &ip_rt_mtu_expires,
3021                .maxlen         = sizeof(int),
3022                .mode           = 0644,
3023                .proc_handler   = &proc_dointvec_jiffies,
3024                .strategy       = &sysctl_jiffies,
3025        },
3026        {
3027                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3028                .procname       = "min_pmtu",
3029                .data           = &ip_rt_min_pmtu,
3030                .maxlen         = sizeof(int),
3031                .mode           = 0644,
3032                .proc_handler   = &proc_dointvec,
3033        },
3034        {
3035                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3036                .procname       = "min_adv_mss",
3037                .data           = &ip_rt_min_advmss,
3038                .maxlen         = sizeof(int),
3039                .mode           = 0644,
3040                .proc_handler   = &proc_dointvec,
3041        },
3042        {
3043                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3044                .procname       = "secret_interval",
3045                .data           = &ip_rt_secret_interval,
3046                .maxlen         = sizeof(int),
3047                .mode           = 0644,
3048                .proc_handler   = &proc_dointvec_jiffies,
3049                .strategy       = &sysctl_jiffies,
3050        },
3051        { .ctl_name = 0 }
3052};
3053#endif
3054
3055#ifdef CONFIG_NET_CLS_ROUTE
3056struct ip_rt_acct *ip_rt_acct;
3057
3058/* This code sucks.  But you should have seen it before! --RR */
3059
3060/* IP route accounting ptr for this logical cpu number. */
3061#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3062
3063#ifdef CONFIG_PROC_FS
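/*
 * Legacy read_proc handler for /proc/net/rt_acct: hands userspace the
 * 256 struct ip_rt_acct counters summed over all possible cpus.  Both
 * offset and length must be u32-aligned, since the summing below works
 * one 32-bit word at a time.
 */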
3064static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3065                           int length, int *eof, void *data)
3066{
3067        unsigned int i;
3068
3069        if ((offset & 3) || (length & 3))
3070                return -EIO;
3071
3072        if (offset >= sizeof(struct ip_rt_acct) * 256) {
3073                *eof = 1;
3074                return 0;
3075        }
3076
3077        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3078                length = sizeof(struct ip_rt_acct) * 256 - offset;
3079                *eof = 1;
3080        }
3081
3082        offset /= sizeof(u32);
3083
3084        if (length > 0) {
3085                u32 *src;
3086                u32 *dst = (u32 *) buffer;
3087
3088                /* Start from a zeroed buffer so no cpu is counted twice. */
3089                *start = buffer;
3090                memset(dst, 0, length);
3091
3092                /* Sum every possible cpu in, one int at a time */
3093                for_each_possible_cpu(i) {
3094                        unsigned int j;
3095
3096                        src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3097
3098                        for (j = 0; j < length/4; j++)
3099                                dst[j] += src[j];
3100                }
3101        }
3102        return length;
3103}
3104#endif /* CONFIG_PROC_FS */
3105#endif /* CONFIG_NET_CLS_ROUTE */
3106
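/*
 * "rhash_entries=N" on the kernel command line (e.g. rhash_entries=65536)
 * overrides the memory-based sizing heuristic that
 * alloc_large_system_hash() otherwise applies to the route cache table.
 */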
3107static __initdata unsigned long rhash_entries;
3108static int __init set_rhash_entries(char *str)
3109{
3110        if (!str)
3111                return 0;
3112        rhash_entries = simple_strtoul(str, &str, 0);
3113        return 1;
3114}
3115__setup("rhash_entries=", set_rhash_entries);
3116
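/*
 * Boot-time setup: seed the hash secret, allocate the per-cpu
 * accounting area (CONFIG_NET_CLS_ROUTE), create the ip_dst_cache slab
 * and the route cache hash table, derive gc_thresh and ip_rt_max_size
 * from the table size, initialize devinet and the FIB, set up the
 * flush, periodic GC and secret-rebuild timers (starting the latter
 * two with some jitter), and register the /proc entries.
 */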
3117int __init ip_rt_init(void)
3118{
3119        int rc = 0;
3120
3121        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3122                             (jiffies ^ (jiffies >> 7)));
3123
3124#ifdef CONFIG_NET_CLS_ROUTE
3125        {
3126        int order;
3127        for (order = 0;
3128             (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3129                /* NOTHING */;
3130        ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3131        if (!ip_rt_acct)
3132                panic("IP: failed to allocate ip_rt_acct\n");
3133        memset(ip_rt_acct, 0, PAGE_SIZE << order);
3134        }
3135#endif
3136
3137        ipv4_dst_ops.kmem_cachep =
3138                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3139                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3140
3141        rt_hash_table = (struct rt_hash_bucket *)
3142                alloc_large_system_hash("IP route cache",
3143                                        sizeof(struct rt_hash_bucket),
3144                                        rhash_entries,
3145                                        (num_physpages >= 128 * 1024) ?
3146                                        15 : 17,
3147                                        0,
3148                                        &rt_hash_log,
3149                                        &rt_hash_mask,
3150                                        0);
3151        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152        rt_hash_lock_init();
3153
3154        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155        ip_rt_max_size = (rt_hash_mask + 1) * 16;
3156
3157        devinet_init();
3158        ip_fib_init();
3159
3160        init_timer(&rt_flush_timer);
3161        rt_flush_timer.function = rt_run_flush;
3162        init_timer(&rt_periodic_timer);
3163        rt_periodic_timer.function = rt_check_expire;
3164        init_timer(&rt_secret_timer);
3165        rt_secret_timer.function = rt_secret_rebuild;
3166
3167        /* All the timers started at system boot tend to synchronize,
3168           so perturb each one: the first expiry lands uniformly in
3169           [interval, 2*interval) from now. */
3170        rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3171                                        ip_rt_gc_interval;
3172        add_timer(&rt_periodic_timer);
3173
3174        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3175                ip_rt_secret_interval;
3176        add_timer(&rt_secret_timer);
3177
3178#ifdef CONFIG_PROC_FS
3179        {
3180        struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3181        if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3182            !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
3183                                             proc_net_stat))) {
3184                return -ENOMEM;
3185        }
3186        rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3187        }
3188#ifdef CONFIG_NET_CLS_ROUTE
3189        create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3190#endif
3191#endif
3192#ifdef CONFIG_XFRM
3193        xfrm_init();
3194        xfrm4_init();
3195#endif
3196        return rc;
3197}
3198
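/* Entry points used elsewhere in the stack and by modules (tunnel
   drivers such as ipip resolve routes via ip_route_output_key). */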
3199EXPORT_SYMBOL(__ip_select_ident);
3200EXPORT_SYMBOL(ip_route_input);
3201EXPORT_SYMBOL(ip_route_output_key);
3202