linux-bk/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9 *
  10 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15 *
  16 * Fixes:
  17 *              Alan Cox        :       Verify area fixes.
  18 *              Alan Cox        :       cli() protects routing changes
  19 *              Rui Oliveira    :       ICMP routing table updates
  20 *              (rco@di.uminho.pt)      Routing table insertion and update
  21 *              Linus Torvalds  :       Rewrote bits to be sensible
  22 *              Alan Cox        :       Added BSD route gw semantics
  23 *              Alan Cox        :       Super /proc >4K 
  24 *              Alan Cox        :       MTU in route table
  25 *              Alan Cox        :       MSS actually. Also added the window
  26 *                                      clamper.
  27 *              Sam Lantinga    :       Fixed route matching in rt_del()
  28 *              Alan Cox        :       Routing cache support.
  29 *              Alan Cox        :       Removed compatibility cruft.
  30 *              Alan Cox        :       RTF_REJECT support.
  31 *              Alan Cox        :       TCP irtt support.
  32 *              Jonathan Naylor :       Added Metric support.
  33 *      Miquel van Smoorenburg  :       BSD API fixes.
  34 *      Miquel van Smoorenburg  :       Metrics.
  35 *              Alan Cox        :       Use __u32 properly
  36 *              Alan Cox        :       Aligned routing errors more closely with BSD
  37 *                                      our system is still very different.
  38 *              Alan Cox        :       Faster /proc handling
  39 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40 *                                      routing caches and better behaviour.
  41 *              
  42 *              Olaf Erb        :       irtt wasn't being copied right.
  43 *              Bjorn Ekwall    :       Kerneld route support.
  44 *              Alan Cox        :       Multicast fixed (I hope)
  45 *              Pavel Krauz     :       Limited broadcast fixed
  46 *              Mike McLagan    :       Routing by source
  47 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48 *                                      route.c and rewritten from scratch.
  49 *              Andi Kleen      :       Load-limit warning messages.
  50 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54 *              Marc Boucher    :       routing by fwmark
  55 *      Robert Olsson           :       Added rt_cache statistics
  56 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57 *
  58 *              This program is free software; you can redistribute it and/or
  59 *              modify it under the terms of the GNU General Public License
  60 *              as published by the Free Software Foundation; either version
  61 *              2 of the License, or (at your option) any later version.
  62 */
  63
  64#include <linux/config.h>
  65#include <linux/module.h>
  66#include <asm/uaccess.h>
  67#include <asm/system.h>
  68#include <asm/bitops.h>
  69#include <linux/types.h>
  70#include <linux/kernel.h>
  71#include <linux/sched.h>
  72#include <linux/mm.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/skbuff.h>
  83#include <linux/rtnetlink.h>
  84#include <linux/inetdevice.h>
  85#include <linux/igmp.h>
  86#include <linux/pkt_sched.h>
  87#include <linux/mroute.h>
  88#include <linux/netfilter_ipv4.h>
  89#include <linux/random.h>
  90#include <linux/jhash.h>
  91#include <linux/rcupdate.h>
  92#include <linux/times.h>
  93#include <net/protocol.h>
  94#include <net/ip.h>
  95#include <net/route.h>
  96#include <net/inetpeer.h>
  97#include <net/sock.h>
  98#include <net/ip_fib.h>
  99#include <net/arp.h>
 100#include <net/tcp.h>
 101#include <net/icmp.h>
 102#include <net/xfrm.h>
 103#ifdef CONFIG_SYSCTL
 104#include <linux/sysctl.h>
 105#endif
 106
 107#define IP_MAX_MTU      0xFFF0
 108
 109#define RT_GC_TIMEOUT (300*HZ)
 110
 111int ip_rt_min_delay             = 2 * HZ;
 112int ip_rt_max_delay             = 10 * HZ;
 113int ip_rt_max_size;
 114int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
 115int ip_rt_gc_interval           = 60 * HZ;
 116int ip_rt_gc_min_interval       = HZ / 2;
 117int ip_rt_redirect_number       = 9;
 118int ip_rt_redirect_load         = HZ / 50;
 119int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
 120int ip_rt_error_cost            = HZ;
 121int ip_rt_error_burst           = 5 * HZ;
 122int ip_rt_gc_elasticity         = 8;
 123int ip_rt_mtu_expires           = 10 * 60 * HZ;
 124int ip_rt_min_pmtu              = 512 + 20 + 20;
 125int ip_rt_min_advmss            = 256;
 126int ip_rt_secret_interval       = 10 * 60 * HZ;
 127static unsigned long rt_deadline;
 128
 129#define RTprint(a...)   printk(KERN_DEBUG a)
 130
 131static struct timer_list rt_flush_timer;
 132static struct timer_list rt_periodic_timer;
 133static struct timer_list rt_secret_timer;
 134
 135/*
 136 *      Interface to generic destination cache.
 137 */
 138
 139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140static void              ipv4_dst_destroy(struct dst_entry *dst);
 141static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 142static void              ipv4_link_failure(struct sk_buff *skb);
 143static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 144static int rt_garbage_collect(void);
 145
 146
 147struct dst_ops ipv4_dst_ops = {
 148        .family =               AF_INET,
 149        .protocol =             __constant_htons(ETH_P_IP),
 150        .gc =                   rt_garbage_collect,
 151        .check =                ipv4_dst_check,
 152        .destroy =              ipv4_dst_destroy,
 153        .negative_advice =      ipv4_negative_advice,
 154        .link_failure =         ipv4_link_failure,
 155        .update_pmtu =          ip_rt_update_pmtu,
 156        .entry_size =           sizeof(struct rtable),
 157};
 158
 159#define ECN_OR_COST(class)      TC_PRIO_##class
 160
 161__u8 ip_tos2prio[16] = {
 162        TC_PRIO_BESTEFFORT,
 163        ECN_OR_COST(FILLER),
 164        TC_PRIO_BESTEFFORT,
 165        ECN_OR_COST(BESTEFFORT),
 166        TC_PRIO_BULK,
 167        ECN_OR_COST(BULK),
 168        TC_PRIO_BULK,
 169        ECN_OR_COST(BULK),
 170        TC_PRIO_INTERACTIVE,
 171        ECN_OR_COST(INTERACTIVE),
 172        TC_PRIO_INTERACTIVE,
 173        ECN_OR_COST(INTERACTIVE),
 174        TC_PRIO_INTERACTIVE_BULK,
 175        ECN_OR_COST(INTERACTIVE_BULK),
 176        TC_PRIO_INTERACTIVE_BULK,
 177        ECN_OR_COST(INTERACTIVE_BULK)
 178};
 179
 180
 181/*
 182 * Route cache.
 183 */
 184
 185/* The locking scheme is rather straightforward:
 186 *
 187 * 1) Read-Copy Update protects the buckets of the central route hash.
 188 * 2) Only writers remove entries, and they hold the lock
 189 *    as they look at rtable reference counts.
 190 * 3) Only readers acquire references to rtable entries,
 191 *    they do so with atomic increments and with the
 192 *    lock held.
 193 */
 194
 195struct rt_hash_bucket {
 196        struct rtable   *chain;
 197        spinlock_t      lock;
 198} __attribute__((__aligned__(8)));
 199
 /* Bucket array, indexed by the value rt_hash_code() returns. */
 static struct rt_hash_bucket    *rt_hash_table;
 /* ANDed into the hash to obtain a bucket index; also the highest index. */
 static unsigned                 rt_hash_mask;
 /* NOTE(review): presumably log2 of the bucket count - set up elsewhere. */
 static int                      rt_hash_log;
 /* Seed for jhash_3words(); re-randomised by rt_run_flush(). */
 static unsigned int             rt_hash_rnd;

 /* Route cache statistics, read per CPU through per_cpu_ptr(). */
 struct rt_cache_stat *rt_cache_stat;

 static int rt_intern_hash(unsigned hash, struct rtable *rth,
                           struct rtable **res);
 209
 210static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
 211{
 212        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
 213                & rt_hash_mask);
 214}
 215
 216#ifdef CONFIG_PROC_FS
 /*
  * Per-open iteration state of the route cache seq_file: the index of the
  * hash bucket currently being walked (buckets are visited from
  * rt_hash_mask down to 0).
  */
 struct rt_cache_iter_state {
         int     bucket;
 };
 220
 221static struct rtable *rt_cache_get_first(struct seq_file *seq)
 222{
 223        struct rtable *r = NULL;
 224        struct rt_cache_iter_state *st = seq->private;
 225
 226        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 227                rcu_read_lock();
 228                r = rt_hash_table[st->bucket].chain;
 229                if (r)
 230                        break;
 231                rcu_read_unlock();
 232        }
 233        return r;
 234}
 235
 236static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 237{
 238        struct rt_cache_iter_state *st = seq->private;
 239
 240        smp_read_barrier_depends();
 241        r = r->u.rt_next;
 242        while (!r) {
 243                rcu_read_unlock();
 244                if (--st->bucket < 0)
 245                        break;
 246                rcu_read_lock();
 247                r = rt_hash_table[st->bucket].chain;
 248        }
 249        return r;
 250}
 251
 252static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 253{
 254        struct rtable *r = rt_cache_get_first(seq);
 255
 256        if (r)
 257                while (pos && (r = rt_cache_get_next(seq, r)))
 258                        --pos;
 259        return pos ? NULL : r;
 260}
 261
 262static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 263{
 264        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 265}
 266
 267static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 268{
 269        struct rtable *r = NULL;
 270
 271        if (v == SEQ_START_TOKEN)
 272                r = rt_cache_get_first(seq);
 273        else
 274                r = rt_cache_get_next(seq, v);
 275        ++*pos;
 276        return r;
 277}
 278
 279static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 280{
 281        if (v && v != SEQ_START_TOKEN)
 282                rcu_read_unlock();
 283}
 284
 285static int rt_cache_seq_show(struct seq_file *seq, void *v)
 286{
 287        if (v == SEQ_START_TOKEN)
 288                seq_printf(seq, "%-127s\n",
 289                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 290                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 291                           "HHUptod\tSpecDst");
 292        else {
 293                struct rtable *r = v;
 294                char temp[256];
 295
 296                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 297                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 298                        r->u.dst.dev ? r->u.dst.dev->name : "*",
 299                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 300                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 301                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
 302                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 303                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 304                        dst_metric(&r->u.dst, RTAX_WINDOW),
 305                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 306                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
 307                        r->fl.fl4_tos,
 308                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 309                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 310                                       dev_queue_xmit) : 0,
 311                        r->rt_spec_dst);
 312                seq_printf(seq, "%-127s\n", temp);
 313        }
 314        return 0;
 315}
 316
 317static struct seq_operations rt_cache_seq_ops = {
 318        .start  = rt_cache_seq_start,
 319        .next   = rt_cache_seq_next,
 320        .stop   = rt_cache_seq_stop,
 321        .show   = rt_cache_seq_show,
 322};
 323
 324static int rt_cache_seq_open(struct inode *inode, struct file *file)
 325{
 326        struct seq_file *seq;
 327        int rc = -ENOMEM;
 328        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
 329
 330        if (!s)
 331                goto out;
 332        rc = seq_open(file, &rt_cache_seq_ops);
 333        if (rc)
 334                goto out_kfree;
 335        seq          = file->private_data;
 336        seq->private = s;
 337        memset(s, 0, sizeof(*s));
 338out:
 339        return rc;
 340out_kfree:
 341        kfree(s);
 342        goto out;
 343}
 344
 345static struct file_operations rt_cache_seq_fops = {
 346        .owner   = THIS_MODULE,
 347        .open    = rt_cache_seq_open,
 348        .read    = seq_read,
 349        .llseek  = seq_lseek,
 350        .release = seq_release_private,
 351};
 352
 353
 354static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 355{
 356        int cpu;
 357
 358        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 359                if (!cpu_possible(cpu))
 360                        continue;
 361                *pos = cpu;
 362                return per_cpu_ptr(rt_cache_stat, cpu);
 363        }
 364        return NULL;
 365}
 366
 367static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 368{
 369        int cpu;
 370
 371        for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
 372                if (!cpu_possible(cpu))
 373                        continue;
 374                *pos = cpu;
 375                return per_cpu_ptr(rt_cache_stat, cpu);
 376        }
 377        return NULL;
 378        
 379}
 380
 /*
  * seq_file ->stop for the per-CPU statistics.  Intentionally empty:
  * rt_cpu_seq_start()/rt_cpu_seq_next() acquire nothing to release.
  */
 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 {
 }
 385
 386static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 387{
 388        struct rt_cache_stat *st = v;
 389        
 390        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 391                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 392                   atomic_read(&ipv4_dst_ops.entries),
 393                   st->in_hit,
 394                   st->in_slow_tot,
 395                   st->in_slow_mc,
 396                   st->in_no_route,
 397                   st->in_brd,
 398                   st->in_martian_dst,
 399                   st->in_martian_src,
 400
 401                   st->out_hit,
 402                   st->out_slow_tot,
 403                   st->out_slow_mc, 
 404
 405                   st->gc_total,
 406                   st->gc_ignored,
 407                   st->gc_goal_miss,
 408                   st->gc_dst_overflow,
 409                   st->in_hlist_search,
 410                   st->out_hlist_search
 411                );
 412        return 0;
 413}
 414
 415static struct seq_operations rt_cpu_seq_ops = {
 416        .start  = rt_cpu_seq_start,
 417        .next   = rt_cpu_seq_next,
 418        .stop   = rt_cpu_seq_stop,
 419        .show   = rt_cpu_seq_show,
 420};
 421
 422
 423static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 424{
 425        return seq_open(file, &rt_cpu_seq_ops);
 426}
 427
 428static struct file_operations rt_cpu_seq_fops = {
 429        .owner   = THIS_MODULE,
 430        .open    = rt_cpu_seq_open,
 431        .read    = seq_read,
 432        .llseek  = seq_lseek,
 433        .release = seq_release_private,
 434};
 435
 436#endif /* CONFIG_PROC_FS */
 437  
 438static __inline__ void rt_free(struct rtable *rt)
 439{
 440        call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
 441}
 442
 /*
  * Release the caller's reference on @rt with ip_rt_put() and then queue
  * the entry for freeing exactly as rt_free() does.
  */
 static __inline__ void rt_drop(struct rtable *rt)
 {
         ip_rt_put(rt);
         rt_free(rt);
 }
 448
 449static __inline__ int rt_fast_clean(struct rtable *rth)
 450{
 451        /* Kill broadcast/multicast entries very aggresively, if they
 452           collide in hash table with more useful entries */
 453        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 454                rth->fl.iif && rth->u.rt_next;
 455}
 456
 457static __inline__ int rt_valuable(struct rtable *rth)
 458{
 459        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 460                rth->u.dst.expires;
 461}
 462
 463static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 464{
 465        unsigned long age;
 466        int ret = 0;
 467
 468        if (atomic_read(&rth->u.dst.__refcnt))
 469                goto out;
 470
 471        ret = 1;
 472        if (rth->u.dst.expires &&
 473            time_after_eq(jiffies, rth->u.dst.expires))
 474                goto out;
 475
 476        age = jiffies - rth->u.dst.lastuse;
 477        ret = 0;
 478        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 479            (age <= tmo2 && rt_valuable(rth)))
 480                goto out;
 481        ret = 1;
 482out:    return ret;
 483}
 484
 485/* Bits of score are:
 486 * 31: very valuable
 487 * 30: not quite useless
 488 * 29..0: usage counter
 489 */
 490static inline u32 rt_score(struct rtable *rt)
 491{
 492        u32 score = jiffies - rt->u.dst.lastuse;
 493
 494        score = ~score & ~(3<<30);
 495
 496        if (rt_valuable(rt))
 497                score |= (1<<31);
 498
 499        if (!rt->fl.iif ||
 500            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 501                score |= (1<<30);
 502
 503        return score;
 504}
 505
 506/* This runs via a timer and thus is always in BH context. */
 507static void rt_check_expire(unsigned long dummy)
 508{
 509        static int rover;
 510        int i = rover, t;
 511        struct rtable *rth, **rthp;
 512        unsigned long now = jiffies;
 513
 514        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
 515             t -= ip_rt_gc_timeout) {
 516                unsigned long tmo = ip_rt_gc_timeout;
 517
 518                i = (i + 1) & rt_hash_mask;
 519                rthp = &rt_hash_table[i].chain;
 520
 521                spin_lock(&rt_hash_table[i].lock);
 522                while ((rth = *rthp) != NULL) {
 523                        if (rth->u.dst.expires) {
 524                                /* Entry is expired even if it is in use */
 525                                if (time_before_eq(now, rth->u.dst.expires)) {
 526                                        tmo >>= 1;
 527                                        rthp = &rth->u.rt_next;
 528                                        continue;
 529                                }
 530                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 531                                tmo >>= 1;
 532                                rthp = &rth->u.rt_next;
 533                                continue;
 534                        }
 535
 536                        /* Cleanup aged off entries. */
 537                        *rthp = rth->u.rt_next;
 538                        rt_free(rth);
 539                }
 540                spin_unlock(&rt_hash_table[i].lock);
 541
 542                /* Fallback loop breaker. */
 543                if (time_after(jiffies, now))
 544                        break;
 545        }
 546        rover = i;
 547        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
 548}
 549
 550/* This can run from both BH and non-BH contexts, the latter
 551 * in the case of a forced flush event.
 552 */
 553static void rt_run_flush(unsigned long dummy)
 554{
 555        int i;
 556        struct rtable *rth, *next;
 557
 558        rt_deadline = 0;
 559
 560        get_random_bytes(&rt_hash_rnd, 4);
 561
 562        for (i = rt_hash_mask; i >= 0; i--) {
 563                spin_lock_bh(&rt_hash_table[i].lock);
 564                rth = rt_hash_table[i].chain;
 565                if (rth)
 566                        rt_hash_table[i].chain = NULL;
 567                spin_unlock_bh(&rt_hash_table[i].lock);
 568
 569                for (; rth; rth = next) {
 570                        next = rth->u.rt_next;
 571                        rt_free(rth);
 572                }
 573        }
 574}
 575
 576static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
 577
 578void rt_cache_flush(int delay)
 579{
 580        unsigned long now = jiffies;
 581        int user_mode = !in_softirq();
 582
 583        if (delay < 0)
 584                delay = ip_rt_min_delay;
 585
 586        spin_lock_bh(&rt_flush_lock);
 587
 588        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 589                long tmo = (long)(rt_deadline - now);
 590
 591                /* If flush timer is already running
 592                   and flush request is not immediate (delay > 0):
 593
 594                   if deadline is not achieved, prolongate timer to "delay",
 595                   otherwise fire it at deadline time.
 596                 */
 597
 598                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 599                        tmo = 0;
 600                
 601                if (delay > tmo)
 602                        delay = tmo;
 603        }
 604
 605        if (delay <= 0) {
 606                spin_unlock_bh(&rt_flush_lock);
 607                rt_run_flush(0);
 608                return;
 609        }
 610
 611        if (rt_deadline == 0)
 612                rt_deadline = now + ip_rt_max_delay;
 613
 614        mod_timer(&rt_flush_timer, now+delay);
 615        spin_unlock_bh(&rt_flush_lock);
 616}
 617
 618static void rt_secret_rebuild(unsigned long dummy)
 619{
 620        unsigned long now = jiffies;
 621
 622        rt_cache_flush(0);
 623        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 624}
 625
 626/*
 627   Short description of GC goals.
 628
 629   We want to build an algorithm that keeps the routing cache
 630   at an equilibrium point, where the number of aged-off entries
 631   stays approximately equal to the number of newly generated ones.
 632
 633   The current expiration strength is the variable "expire".
 634   We try to adjust it dynamically, so that when networking
 635   is idle "expire" is large enough to keep enough warm entries,
 636   and when load increases it shrinks to limit the cache size.
 637 */
 638
 639static int rt_garbage_collect(void)
 640{
 641        static unsigned long expire = RT_GC_TIMEOUT;
 642        static unsigned long last_gc;
 643        static int rover;
 644        static int equilibrium;
 645        struct rtable *rth, **rthp;
 646        unsigned long now = jiffies;
 647        int goal;
 648
 649        /*
 650         * Garbage collection is pretty expensive,
 651         * do not make it too frequently.
 652         */
 653
 654        RT_CACHE_STAT_INC(gc_total);
 655
 656        if (now - last_gc < ip_rt_gc_min_interval &&
 657            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 658                RT_CACHE_STAT_INC(gc_ignored);
 659                goto out;
 660        }
 661
 662        /* Calculate number of entries, which we want to expire now. */
 663        goal = atomic_read(&ipv4_dst_ops.entries) -
 664                (ip_rt_gc_elasticity << rt_hash_log);
 665        if (goal <= 0) {
 666                if (equilibrium < ipv4_dst_ops.gc_thresh)
 667                        equilibrium = ipv4_dst_ops.gc_thresh;
 668                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 669                if (goal > 0) {
 670                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 671                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 672                }
 673        } else {
 674                /* We are in dangerous area. Try to reduce cache really
 675                 * aggressively.
 676                 */
 677                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 678                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 679        }
 680
 681        if (now - last_gc >= ip_rt_gc_min_interval)
 682                last_gc = now;
 683
 684        if (goal <= 0) {
 685                equilibrium += goal;
 686                goto work_done;
 687        }
 688
 689        do {
 690                int i, k;
 691
 692                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 693                        unsigned long tmo = expire;
 694
 695                        k = (k + 1) & rt_hash_mask;
 696                        rthp = &rt_hash_table[k].chain;
 697                        spin_lock_bh(&rt_hash_table[k].lock);
 698                        while ((rth = *rthp) != NULL) {
 699                                if (!rt_may_expire(rth, tmo, expire)) {
 700                                        tmo >>= 1;
 701                                        rthp = &rth->u.rt_next;
 702                                        continue;
 703                                }
 704                                *rthp = rth->u.rt_next;
 705                                rt_free(rth);
 706                                goal--;
 707                        }
 708                        spin_unlock_bh(&rt_hash_table[k].lock);
 709                        if (goal <= 0)
 710                                break;
 711                }
 712                rover = k;
 713
 714                if (goal <= 0)
 715                        goto work_done;
 716
 717                /* Goal is not achieved. We stop process if:
 718
 719                   - if expire reduced to zero. Otherwise, expire is halfed.
 720                   - if table is not full.
 721                   - if we are called from interrupt.
 722                   - jiffies check is just fallback/debug loop breaker.
 723                     We will not spin here for long time in any case.
 724                 */
 725
 726                RT_CACHE_STAT_INC(gc_goal_miss);
 727
 728                if (expire == 0)
 729                        break;
 730
 731                expire >>= 1;
 732#if RT_CACHE_DEBUG >= 2
 733                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 734                                atomic_read(&ipv4_dst_ops.entries), goal, i);
 735#endif
 736
 737                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 738                        goto out;
 739        } while (!in_softirq() && time_before_eq(jiffies, now));
 740
 741        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 742                goto out;
 743        if (net_ratelimit())
 744                printk(KERN_WARNING "dst cache overflow\n");
 745        RT_CACHE_STAT_INC(gc_dst_overflow);
 746        return 1;
 747
 748work_done:
 749        expire += ip_rt_gc_min_interval;
 750        if (expire > ip_rt_gc_timeout ||
 751            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 752                expire = ip_rt_gc_timeout;
 753#if RT_CACHE_DEBUG >= 2
 754        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 755                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
 756#endif
 757out:    return 0;
 758}
 759
 760static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 761{
 762        return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
 763               fl1->oif     == fl2->oif &&
 764               fl1->iif     == fl2->iif;
 765}
 766
 767static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 768{
 769        struct rtable   *rth, **rthp;
 770        unsigned long   now;
 771        struct rtable *cand, **candp;
 772        u32             min_score;
 773        int             chain_length;
 774        int attempts = !in_softirq();
 775
 776restart:
 777        chain_length = 0;
 778        min_score = ~(u32)0;
 779        cand = NULL;
 780        candp = NULL;
 781        now = jiffies;
 782
 783        rthp = &rt_hash_table[hash].chain;
 784
 785        spin_lock_bh(&rt_hash_table[hash].lock);
 786        while ((rth = *rthp) != NULL) {
 787                if (compare_keys(&rth->fl, &rt->fl)) {
 788                        /* Put it first */
 789                        *rthp = rth->u.rt_next;
 790                        /*
 791                         * Since lookup is lockfree, the deletion
 792                         * must be visible to another weakly ordered CPU before
 793                         * the insertion at the start of the hash chain.
 794                         */
 795                        smp_wmb();
 796                        rth->u.rt_next = rt_hash_table[hash].chain;
 797                        /*
 798                         * Since lookup is lockfree, the update writes
 799                         * must be ordered for consistency on SMP.
 800                         */
 801                        smp_wmb();
 802                        rt_hash_table[hash].chain = rth;
 803
 804                        rth->u.dst.__use++;
 805                        dst_hold(&rth->u.dst);
 806                        rth->u.dst.lastuse = now;
 807                        spin_unlock_bh(&rt_hash_table[hash].lock);
 808
 809                        rt_drop(rt);
 810                        *rp = rth;
 811                        return 0;
 812                }
 813
 814                if (!atomic_read(&rth->u.dst.__refcnt)) {
 815                        u32 score = rt_score(rth);
 816
 817                        if (score <= min_score) {
 818                                cand = rth;
 819                                candp = rthp;
 820                                min_score = score;
 821                        }
 822                }
 823
 824                chain_length++;
 825
 826                rthp = &rth->u.rt_next;
 827        }
 828
 829        if (cand) {
 830                /* ip_rt_gc_elasticity used to be average length of chain
 831                 * length, when exceeded gc becomes really aggressive.
 832                 *
 833                 * The second limit is less certain. At the moment it allows
 834                 * only 2 entries per bucket. We will see.
 835                 */
 836                if (chain_length > ip_rt_gc_elasticity) {
 837                        *candp = cand->u.rt_next;
 838                        rt_free(cand);
 839                }
 840        }
 841
 842        /* Try to bind route to arp only if it is output
 843           route or unicast forwarding path.
 844         */
 845        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 846                int err = arp_bind_neighbour(&rt->u.dst);
 847                if (err) {
 848                        spin_unlock_bh(&rt_hash_table[hash].lock);
 849
 850                        if (err != -ENOBUFS) {
 851                                rt_drop(rt);
 852                                return err;
 853                        }
 854
 855                        /* Neighbour tables are full and nothing
 856                           can be released. Try to shrink route cache,
 857                           it is most likely it holds some neighbour records.
 858                         */
 859                        if (attempts-- > 0) {
 860                                int saved_elasticity = ip_rt_gc_elasticity;
 861                                int saved_int = ip_rt_gc_min_interval;
 862                                ip_rt_gc_elasticity     = 1;
 863                                ip_rt_gc_min_interval   = 0;
 864                                rt_garbage_collect();
 865                                ip_rt_gc_min_interval   = saved_int;
 866                                ip_rt_gc_elasticity     = saved_elasticity;
 867                                goto restart;
 868                        }
 869
 870                        if (net_ratelimit())
 871                                printk(KERN_WARNING "Neighbour table overflow.\n");
 872                        rt_drop(rt);
 873                        return -ENOBUFS;
 874                }
 875        }
 876
 877        rt->u.rt_next = rt_hash_table[hash].chain;
 878#if RT_CACHE_DEBUG >= 2
 879        if (rt->u.rt_next) {
 880                struct rtable *trt;
 881                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
 882                       NIPQUAD(rt->rt_dst));
 883                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
 884                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
 885                printk("\n");
 886        }
 887#endif
 888        rt_hash_table[hash].chain = rt;
 889        spin_unlock_bh(&rt_hash_table[hash].lock);
 890        *rp = rt;
 891        return 0;
 892}
 893
 894void rt_bind_peer(struct rtable *rt, int create)
 895{
 896        static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
 897        struct inet_peer *peer;
 898
 899        peer = inet_getpeer(rt->rt_dst, create);
 900
 901        spin_lock_bh(&rt_peer_lock);
 902        if (rt->peer == NULL) {
 903                rt->peer = peer;
 904                peer = NULL;
 905        }
 906        spin_unlock_bh(&rt_peer_lock);
 907        if (peer)
 908                inet_putpeer(peer);
 909}
 910
 911/*
 912 * Peer allocation may fail only in serious out-of-memory conditions.  However
 913 * we still can generate some output.
 914 * Random ID selection looks a bit dangerous because we have no chances to
 915 * select ID being unique in a reasonable period of time.
 916 * But broken packet identifier may be better than no packet at all.
 917 */
 918static void ip_select_fb_ident(struct iphdr *iph)
 919{
 920        static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
 921        static u32 ip_fallback_id;
 922        u32 salt;
 923
 924        spin_lock_bh(&ip_fb_id_lock);
 925        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
 926        iph->id = htons(salt & 0xFFFF);
 927        ip_fallback_id = salt;
 928        spin_unlock_bh(&ip_fb_id_lock);
 929}
 930
 931void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 932{
 933        struct rtable *rt = (struct rtable *) dst;
 934
 935        if (rt) {
 936                if (rt->peer == NULL)
 937                        rt_bind_peer(rt, 1);
 938
 939                /* If peer is attached to destination, it is never detached,
 940                   so that we need not to grab a lock to dereference it.
 941                 */
 942                if (rt->peer) {
 943                        iph->id = htons(inet_getid(rt->peer, more));
 944                        return;
 945                }
 946        } else
 947                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
 948
 949        ip_select_fb_ident(iph);
 950}
 951
 952static void rt_del(unsigned hash, struct rtable *rt)
 953{
 954        struct rtable **rthp;
 955
 956        spin_lock_bh(&rt_hash_table[hash].lock);
 957        ip_rt_put(rt);
 958        for (rthp = &rt_hash_table[hash].chain; *rthp;
 959             rthp = &(*rthp)->u.rt_next)
 960                if (*rthp == rt) {
 961                        *rthp = rt->u.rt_next;
 962                        rt_free(rt);
 963                        break;
 964                }
 965        spin_unlock_bh(&rt_hash_table[hash].lock);
 966}
 967
/*
 * Handle a redirect received on dev from old_gw, advising that packets
 * from saddr to daddr (with the given tos) should be sent via new_gw.
 *
 * The advice is validated first; a rejected redirect is only logged (and
 * only when CONFIG_IP_ROUTE_VERBOSE is set).  Otherwise the route cache is
 * searched, for each combination of source key and output-interface key,
 * for an output entry towards daddr that currently uses old_gw.  A matching
 * entry is replaced by a copy that points at new_gw and carries
 * RTCF_REDIRECTED.
 */
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        /* Lookup keys: both the specific value and 0 are tried. */
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        /* Reject gateways that cannot be valid next hops, and honour the
         * per-device setting that disables accepting redirects.
         */
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp=&rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = *rthp) != NULL) {
                                struct rtable *rt;

                                smp_read_barrier_depends();
                                /* Skip entries whose lookup key differs;
                                 * iif == 0 restricts the match to output routes.
                                 */
                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.fl4_tos != tos ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                /* Key matches, but the entry does not describe
                                 * the path the redirect talks about: stop
                                 * searching this chain.
                                 */
                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                /* Pin the entry before leaving the RCU
                                 * read-side section.
                                 */
                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                /* Then reset the per-entry state that must not
                                 * be shared with the original.
                                 */
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                /* The copied peer pointer needs its own reference. */
                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                /* Only switch over if the new gateway has a
                                 * valid neighbour entry; otherwise kick off
                                 * resolution (when an entry exists) and keep
                                 * the old route.
                                 */
                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                /* rt_del() also releases the reference taken
                                 * by dst_hold() above.
                                 */
                                rt_del(hash, rth);
                                /* On success drop the reference returned
                                 * through the last argument.
                                 */
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}
1096
1097static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1098{
1099        struct rtable *rt = (struct rtable*)dst;
1100        struct dst_entry *ret = dst;
1101
1102        if (rt) {
1103                if (dst->obsolete) {
1104                        ip_rt_put(rt);
1105                        ret = NULL;
1106                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1107                           rt->u.dst.expires) {
1108                        unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1109                                                     rt->fl.fl4_src ^
1110                                                        (rt->fl.oif << 5),
1111                                                     rt->fl.fl4_tos);
1112#if RT_CACHE_DEBUG >= 1
1113                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
1114                                          "%u.%u.%u.%u/%02x dropped\n",
1115                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1116#endif
1117                        rt_del(hash, rt);
1118                        ret = NULL;
1119                }
1120        }
1121        return ret;
1122}
1123
1124/*
1125 * Algorithm:
1126 *      1. The first ip_rt_redirect_number redirects are sent
1127 *         with exponential backoff, then we stop sending them at all,
1128 *         assuming that the host ignores our redirects.
1129 *      2. If we did not see packets requiring redirects
1130 *         during ip_rt_redirect_silence, we assume that the host
1131 *         forgot redirected route and start to send redirects again.
1132 *
1133 * This algorithm is much cheaper and more intelligent than dumb load limiting
1134 * in icmp.c.
1135 *
1136 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1137 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1138 */
1139
1140void ip_rt_send_redirect(struct sk_buff *skb)
1141{
1142        struct rtable *rt = (struct rtable*)skb->dst;
1143        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1144
1145        if (!in_dev)
1146                return;
1147
1148        if (!IN_DEV_TX_REDIRECTS(in_dev))
1149                goto out;
1150
1151        /* No redirected packets during ip_rt_redirect_silence;
1152         * reset the algorithm.
1153         */
1154        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1155                rt->u.dst.rate_tokens = 0;
1156
1157        /* Too many ignored redirects; do not send anything
1158         * set u.dst.rate_last to the last seen redirected packet.
1159         */
1160        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1161                rt->u.dst.rate_last = jiffies;
1162                goto out;
1163        }
1164
1165        /* Check for load limit; set rate_last to the latest sent
1166         * redirect.
1167         */
1168        if (time_after(jiffies,
1169                       (rt->u.dst.rate_last +
1170                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1171                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1172                rt->u.dst.rate_last = jiffies;
1173                ++rt->u.dst.rate_tokens;
1174#ifdef CONFIG_IP_ROUTE_VERBOSE
1175                if (IN_DEV_LOG_MARTIANS(in_dev) &&
1176                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1177                    net_ratelimit())
1178                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1179                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1180                                NIPQUAD(rt->rt_src), rt->rt_iif,
1181                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1182#endif
1183        }
1184out:
1185        in_dev_put(in_dev);
1186}
1187
1188static int ip_error(struct sk_buff *skb)
1189{
1190        struct rtable *rt = (struct rtable*)skb->dst;
1191        unsigned long now;
1192        int code;
1193
1194        switch (rt->u.dst.error) {
1195                case EINVAL:
1196                default:
1197                        goto out;
1198                case EHOSTUNREACH:
1199                        code = ICMP_HOST_UNREACH;
1200                        break;
1201                case ENETUNREACH:
1202                        code = ICMP_NET_UNREACH;
1203                        break;
1204                case EACCES:
1205                        code = ICMP_PKT_FILTERED;
1206                        break;
1207        }
1208
1209        now = jiffies;
1210        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1211        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1212                rt->u.dst.rate_tokens = ip_rt_error_burst;
1213        rt->u.dst.rate_last = now;
1214        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1215                rt->u.dst.rate_tokens -= ip_rt_error_cost;
1216                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1217        }
1218
1219out:    kfree_skb(skb);
1220        return 0;
1221} 
1222
/*
 *      MTU plateau values, in descending order.  The table is only ever
 *      read, so it is const.
 *
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau that is strictly smaller than old_mtu,
 * or 68 when old_mtu does not exceed the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        /* Unsigned index: ARRAY_SIZE() yields an unsigned value. */
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
1240
/*
 * Lower the cached path MTU of output routes matching the packet header
 * iph, given the MTU value new_mtu reported for that packet.
 *
 * Does nothing and returns 0 when path MTU discovery is disabled
 * (ipv4_config.no_pmtu_disc).  Otherwise returns the MTU that was
 * accepted for a matching route if any, else new_mtu.
 */
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        /* Total length of the offending packet, used when new_mtu is unusable. */
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        /* Source lookup keys: the packet's source address and 0. */
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash_code(daddr, skeys[i], tos);

                rcu_read_lock();
                for (rth = rt_hash_table[hash].chain; rth;
                     rth = rth->u.rt_next) {
                        smp_read_barrier_depends();
                        /* Only output routes (iif == 0) for this flow whose
                         * MTU metric is not locked are considered.
                         */
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.fl4_tos == tos &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                /* The reported value is implausible (below 68
                                 * or not smaller than the packet itself):
                                 * guess the next lower plateau instead.
                                 */
                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
                                                dst_confirm(&rth->u.dst);
                                                /* Never go below ip_rt_min_pmtu;
                                                 * clamp and lock the metric so it
                                                 * is not lowered again.
                                                 */
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        /* GNU "?:" shorthand: est_mtu if non-zero, otherwise new_mtu. */
        return est_mtu ? : new_mtu;
}
1300
1301static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1302{
1303        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1304            !(dst_metric_locked(dst, RTAX_MTU))) {
1305                if (mtu < ip_rt_min_pmtu) {
1306                        mtu = ip_rt_min_pmtu;
1307                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1308                }
1309                dst->metrics[RTAX_MTU-1] = mtu;
1310                dst_set_expires(dst, ip_rt_mtu_expires);
1311        }
1312}
1313
/*
 * Release the reference on dst and return NULL unconditionally.
 * The cookie argument is not used.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        dst_release(dst);
        return NULL;
}
1319
1320static void ipv4_dst_destroy(struct dst_entry *dst)
1321{
1322        struct rtable *rt = (struct rtable *) dst;
1323        struct inet_peer *peer = rt->peer;
1324
1325        if (peer) {
1326                rt->peer = NULL;
1327                inet_putpeer(peer);
1328        }
1329}
1330
1331static void ipv4_link_failure(struct sk_buff *skb)
1332{
1333        struct rtable *rt;
1334
1335        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1336
1337        rt = (struct rtable *) skb->dst;
1338        if (rt)
1339                dst_set_expires(&rt->u.dst, 0);
1340}
1341
/*
 * Handler for paths that should never be taken (ip_route_input_mc()
 * installs it as the output function of multicast input routes).
 * Logs the packet's source address, destination address and device name
 * ("?" when skb->dev is NULL), frees the skb and returns 0.
 */
static int ip_rt_bug(struct sk_buff *skb)
{
        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}
1350
1351/*
1352   We do not cache source address of outgoing interface,
1353   because it is used only by IP RR, TS and SRR options,
1354   so that it out of fast path.
1355
1356   BTW remember: "addr" is allowed to be not aligned
1357   in IP options!
1358 */
1359
1360void ip_rt_get_source(u8 *addr, struct rtable *rt)
1361{
1362        u32 src;
1363        struct fib_result res;
1364
1365        if (rt->fl.iif == 0)
1366                src = rt->rt_src;
1367        else if (fib_lookup(&rt->fl, &res) == 0) {
1368#ifdef CONFIG_IP_ROUTE_NAT
1369                if (res.type == RTN_NAT)
1370                        src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1371                                                RT_SCOPE_UNIVERSE);
1372                else
1373#endif
1374                        src = FIB_RES_PREFSRC(res);
1375                fib_res_put(&res);
1376        } else
1377                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1378                                        RT_SCOPE_UNIVERSE);
1379        memcpy(addr, &src, 4);
1380}
1381
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge tag into the route's tclassid.  The low and high 16-bit halves
 * are handled independently: a half is taken from tag only when that half
 * of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half  = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->u.dst.tclassid & low_half) == 0)
                rt->u.dst.tclassid |= tag & low_half;
        if ((rt->u.dst.tclassid & high_half) == 0)
                rt->u.dst.tclassid |= tag & high_half;
}
#endif
1391
1392static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1393{
1394        struct fib_info *fi = res->fi;
1395
1396        if (fi) {
1397                if (FIB_RES_GW(*res) &&
1398                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1399                        rt->rt_gateway = FIB_RES_GW(*res);
1400                memcpy(rt->u.dst.metrics, fi->fib_metrics,
1401                       sizeof(rt->u.dst.metrics));
1402                if (fi->fib_mtu == 0) {
1403                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1404                        if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1405                            rt->rt_gateway != rt->rt_dst &&
1406                            rt->u.dst.dev->mtu > 576)
1407                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
1408                }
1409#ifdef CONFIG_NET_CLS_ROUTE
1410                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1411#endif
1412        } else
1413                rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1414
1415        if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1416                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1417        if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1418                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1419        if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1420                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1421                                       ip_rt_min_advmss);
1422        if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1423                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1424
1425#ifdef CONFIG_NET_CLS_ROUTE
1426#ifdef CONFIG_IP_MULTIPLE_TABLES
1427        set_class_tag(rt, fib_rules_tclass(res));
1428#endif
1429        set_class_tag(rt, itag);
1430#endif
1431        rt->rt_type = res->type;
1432}
1433
/*
 * Build an input route cache entry for a packet with multicast
 * destination daddr received on dev, and attach it to skb->dst.
 *
 * "our" is non-zero when the packet should also be delivered locally.
 * Returns the result of rt_intern_hash() on success, -EINVAL when the
 * device has no in_device or the source fails validation, and -ENOBUFS
 * when no cache entry could be allocated.
 */
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        unsigned hash;
        struct rtable *rth;
        u32 spec_dst;
        struct in_device *in_dev = in_dev_get(dev);
        u32 itag = 0;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        /* A zero-net source is accepted only for link-local multicast
         * groups; any other source must pass fib_validate_source().
         */
        if (ZERONET(saddr)) {
                if (!LOCAL_MCAST(daddr))
                        goto e_inval;
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        } else if (fib_validate_source(saddr, 0, tos, 0,
                                        dev, &spec_dst, &itag) < 0)
                goto e_inval;

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        /* This is an input route; sending through it is a bug. */
        rth->u.dst.output= ip_rt_bug;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        if (in_dev->cnf.no_policy)
                rth->u.dst.flags |= DST_NOPOLICY;
        rth->fl.fl4_dst = daddr;
        rth->rt_dst     = daddr;
        rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->fl.fl4_fwmark= skb->nfmark;
#endif
        rth->fl.fl4_src = saddr;
        rth->rt_src     = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = daddr;
        rth->rt_src_map = saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif     =
        rth->fl.iif     = dev->ifindex;
        /* The entry is bound to the loopback device, which gets a reference. */
        rth->u.dst.dev  = &loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->fl.oif     = 0;
        rth->rt_gateway = daddr;
        rth->rt_spec_dst= spec_dst;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_flags   = RTCF_MULTICAST;
        if (our) {
                rth->u.dst.input= ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        /* With multicast forwarding enabled, non-link-local groups are
         * handed to ip_mr_input instead (overriding the local-delivery
         * handler set above).
         */
        if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->u.dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        in_dev_put(in_dev);
        hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
        return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
        in_dev_put(in_dev);
        return -ENOBUFS;

e_inval:
        in_dev_put(in_dev);
        return -EINVAL;
}
1517
/*
 *      NOTE. We drop all packets that have a local source
 *      address, because every properly looped-back packet
 *      must already have the correct destination attached by the
 *      output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1527
1528int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1529                        u8 tos, struct net_device *dev)
1530{
1531        struct fib_result res;
1532        struct in_device *in_dev = in_dev_get(dev);
1533        struct in_device *out_dev = NULL;
1534        struct flowi fl = { .nl_u = { .ip4_u =
1535                                      { .daddr = daddr,
1536                                        .saddr = saddr,
1537                                        .tos = tos,
1538                                        .scope = RT_SCOPE_UNIVERSE,
1539#ifdef CONFIG_IP_ROUTE_FWMARK
1540                                        .fwmark = skb->nfmark
1541#endif
1542                                      } },
1543                            .iif = dev->ifindex };
1544        unsigned        flags = 0;
1545        u32             itag = 0;
1546        struct rtable * rth;
1547        unsigned        hash;
1548        u32             spec_dst;
1549        int             err = -EINVAL;
1550        int             free_res = 0;
1551
1552        /* IP on this device is disabled. */
1553
1554        if (!in_dev)
1555                goto out;
1556
1557        hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
1558
1559        /* Check for the most weird martians, which can be not detected
1560           by fib_lookup.
1561         */
1562
1563        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1564                goto martian_source;
1565
1566        if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1567                goto brd_input;
1568
1569        /* Accept zero addresses only to limited broadcast;
1570         * I even do not know to fix it or not. Waiting for complains :-)
1571         */
1572        if (ZERONET(saddr))
1573                goto martian_source;
1574
1575        if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1576                goto martian_destination;
1577
1578        /*
1579         *      Now we are ready to route packet.
1580         */
1581        if ((err = fib_lookup(&fl, &res)) != 0) {
1582                if (!IN_DEV_FORWARD(in_dev))
1583                        goto e_inval;
1584                goto no_route;
1585        }
1586        free_res = 1;
1587
1588        RT_CACHE_STAT_INC(in_slow_tot);
1589
1590#ifdef CONFIG_IP_ROUTE_NAT
1591        /* Policy is applied before mapping destination,
1592           but rerouting after map should be made with old source.
1593         */
1594
1595        if (1) {
1596                u32 src_map = saddr;
1597                if (res.r)
1598                        src_map = fib_rules_policy(saddr, &res, &flags);
1599
1600                if (res.type == RTN_NAT) {
1601                        fl.fl4_dst = fib_rules_map_destination(daddr, &res);
1602                        fib_res_put(&res);
1603                        free_res = 0;
1604                        if (fib_lookup(&fl, &res))
1605                                goto e_inval;
1606                        free_res = 1;
1607                        if (res.type != RTN_UNICAST)
1608                                goto e_inval;
1609                        flags |= RTCF_DNAT;
1610                }
1611                fl.fl4_src = src_map;
1612        }
1613#endif
1614
1615        if (res.type == RTN_BROADCAST)
1616                goto brd_input;
1617
1618        if (res.type == RTN_LOCAL) {
1619                int result;
1620                result = fib_validate_source(saddr, daddr, tos,
1621                                             loopback_dev.ifindex,
1622                                             dev, &spec_dst, &itag);
1623                if (result < 0)
1624                        goto martian_source;
1625                if (result)
1626                        flags |= RTCF_DIRECTSRC;
1627                spec_dst = daddr;
1628                goto local_input;
1629        }
1630
1631        if (!IN_DEV_FORWARD(in_dev))
1632                goto e_inval;
1633        if (res.type != RTN_UNICAST)
1634                goto martian_destination;
1635
1636#ifdef CONFIG_IP_ROUTE_MULTIPATH
1637        if (res.fi->fib_nhs > 1 && fl.oif == 0)
1638                fib_select_multipath(&fl, &res);
1639#endif
1640        out_dev = in_dev_get(FIB_RES_DEV(res));
1641        if (out_dev == NULL) {
1642                if (net_ratelimit())
1643                        printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1644                                         "Please, report\n");
1645                goto e_inval;
1646        }
1647
1648        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1649                                  &spec_dst, &itag);
1650        if (err < 0)
1651                goto martian_source;
1652
1653        if (err)
1654                flags |= RTCF_DIRECTSRC;
1655
1656        if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1657            (IN_DEV_SHARED_MEDIA(out_dev) ||
1658             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1659                flags |= RTCF_DOREDIRECT;
1660
1661        if (skb->protocol != htons(ETH_P_IP)) {
1662                /* Not IP (i.e. ARP). Do not create route, if it is
1663                 * invalid for proxy arp. DNAT routes are always valid.
1664                 */
1665                if (out_dev == in_dev && !(flags & RTCF_DNAT))
1666                        goto e_inval;
1667        }
1668
1669        rth = dst_alloc(&ipv4_dst_ops);
1670        if (!rth)
1671                goto e_nobufs;
1672
1673        atomic_set(&rth->u.dst.__refcnt, 1);
1674        rth->u.dst.flags= DST_HOST;
1675        if (in_dev->cnf.no_policy)
1676                rth->u.dst.flags |= DST_NOPOLICY;
1677        if (in_dev->cnf.no_xfrm)
1678                rth->u.dst.flags |= DST_NOXFRM;
1679        rth->fl.fl4_dst = daddr;
1680        rth->rt_dst     = daddr;
1681        rth->fl.fl4_tos = tos;
1682#ifdef CONFIG_IP_ROUTE_FWMARK
1683        rth->fl.fl4_fwmark= skb->nfmark;
1684#endif
1685        rth->fl.fl4_src = saddr;
1686        rth->rt_src     = saddr;
1687        rth->rt_gateway = daddr;
1688#ifdef CONFIG_IP_ROUTE_NAT
1689        rth->rt_src_map = fl.fl4_src;
1690        rth->rt_dst_map = fl.fl4_dst;
1691        if (flags&RTCF_DNAT)
1692                rth->rt_gateway = fl.fl4_dst;
1693#endif
1694        rth->rt_iif     =
1695        rth->fl.iif     = dev->ifindex;
1696        rth->u.dst.dev  = out_dev->dev;
1697        dev_hold(rth->u.dst.dev);
1698        rth->fl.oif     = 0;
1699        rth->rt_spec_dst= spec_dst;
1700
1701        rth->u.dst.input = ip_forward;
1702        rth->u.dst.output = ip_output;
1703
1704        rt_set_nexthop(rth, &res, itag);
1705
1706        rth->rt_flags = flags;
1707
1708#ifdef CONFIG_NET_FASTROUTE
1709        if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1710                struct net_device *odev = rth->u.dst.dev;
1711                if (odev != dev &&
1712                    dev->accept_fastpath &&
1713                    odev->mtu >= dev->mtu &&
1714                    dev->accept_fastpath(dev, &rth->u.dst) == 0)
1715                        rth->rt_flags |= RTCF_FAST;
1716        }
1717#endif
1718
1719intern:
1720        err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1721done:
1722        in_dev_put(in_dev);
1723        if (out_dev)
1724                in_dev_put(out_dev);
1725        if (free_res)
1726                fib_res_put(&res);
1727out:    return err;
1728
1729brd_input:
1730        if (skb->protocol != htons(ETH_P_IP))
1731                goto e_inval;
1732
1733        if (ZERONET(saddr))
1734                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1735        else {
1736                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1737                                          &itag);
1738                if (err < 0)
1739                        goto martian_source;
1740                if (err)
1741                        flags |= RTCF_DIRECTSRC;
1742        }
1743        flags |= RTCF_BROADCAST;
1744        res.type = RTN_BROADCAST;
1745        RT_CACHE_STAT_INC(in_brd);
1746
1747local_input:
1748        rth = dst_alloc(&ipv4_dst_ops);
1749        if (!rth)
1750                goto e_nobufs;
1751
1752        rth->u.dst.output= ip_rt_bug;
1753
1754        atomic_set(&rth->u.dst.__refcnt, 1);
1755        rth->u.dst.flags= DST_HOST;
1756        if (in_dev->cnf.no_policy)
1757                rth->u.dst.flags |= DST_NOPOLICY;
1758        rth->fl.fl4_dst = daddr;
1759        rth->rt_dst     = daddr;
1760        rth->fl.fl4_tos = tos;
1761#ifdef CONFIG_IP_ROUTE_FWMARK
1762        rth->fl.fl4_fwmark= skb->nfmark;
1763#endif
1764        rth->fl.fl4_src = saddr;
1765        rth->rt_src     = saddr;
1766#ifdef CONFIG_IP_ROUTE_NAT
1767        rth->rt_dst_map = fl.fl4_dst;
1768        rth->rt_src_map = fl.fl4_src;
1769#endif
1770#ifdef CONFIG_NET_CLS_ROUTE
1771        rth->u.dst.tclassid = itag;
1772#endif
1773        rth->rt_iif     =
1774        rth->fl.iif     = dev->ifindex;
1775        rth->u.dst.dev  = &loopback_dev;
1776        dev_hold(rth->u.dst.dev);
1777        rth->rt_gateway = daddr;
1778        rth->rt_spec_dst= spec_dst;
1779        rth->u.dst.input= ip_local_deliver;
1780        rth->rt_flags   = flags|RTCF_LOCAL;
1781        if (res.type == RTN_UNREACHABLE) {
1782                rth->u.dst.input= ip_error;
1783                rth->u.dst.error= -err;
1784                rth->rt_flags   &= ~RTCF_LOCAL;
1785        }
1786        rth->rt_type    = res.type;
1787        goto intern;
1788
1789no_route:
1790        RT_CACHE_STAT_INC(in_no_route);
1791        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1792        res.type = RTN_UNREACHABLE;
1793        goto local_input;
1794
1795        /*
1796         *      Do not cache martian addresses: they should be logged (RFC1812)
1797         */
1798martian_destination:
1799        RT_CACHE_STAT_INC(in_martian_dst);
1800#ifdef CONFIG_IP_ROUTE_VERBOSE
1801        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1802                printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1803                        "%u.%u.%u.%u, dev %s\n",
1804                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1805#endif
1806e_inval:
1807        err = -EINVAL;
1808        goto done;
1809
1810e_nobufs:
1811        err = -ENOBUFS;
1812        goto done;
1813
1814martian_source:
1815
1816        RT_CACHE_STAT_INC(in_martian_src);
1817#ifdef CONFIG_IP_ROUTE_VERBOSE
1818        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1819                /*
1820                 *      RFC1812 recommendation, if source is martian,
1821                 *      the only hint is MAC header.
1822                 */
1823                printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1824                        "%u.%u.%u.%u, on dev %s\n",
1825                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1826                if (dev->hard_header_len) {
1827                        int i;
1828                        unsigned char *p = skb->mac.raw;
1829                        printk(KERN_WARNING "ll header: ");
1830                        for (i = 0; i < dev->hard_header_len; i++, p++) {
1831                                printk("%02x", *p);
1832                                if (i < (dev->hard_header_len - 1))
1833                                        printk(":");
1834                        }
1835                        printk("\n");
1836                }
1837        }
1838#endif
1839        goto e_inval;
1840}
1841
1842int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1843                   u8 tos, struct net_device *dev)
1844{
1845        struct rtable * rth;
1846        unsigned        hash;
1847        int iif = dev->ifindex;
1848
1849        tos &= IPTOS_RT_MASK;
1850        hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1851
1852        rcu_read_lock();
1853        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1854                smp_read_barrier_depends();
1855                if (rth->fl.fl4_dst == daddr &&
1856                    rth->fl.fl4_src == saddr &&
1857                    rth->fl.iif == iif &&
1858                    rth->fl.oif == 0 &&
1859#ifdef CONFIG_IP_ROUTE_FWMARK
1860                    rth->fl.fl4_fwmark == skb->nfmark &&
1861#endif
1862                    rth->fl.fl4_tos == tos) {
1863                        rth->u.dst.lastuse = jiffies;
1864                        dst_hold(&rth->u.dst);
1865                        rth->u.dst.__use++;
1866                        RT_CACHE_STAT_INC(in_hit);
1867                        rcu_read_unlock();
1868                        skb->dst = (struct dst_entry*)rth;
1869                        return 0;
1870                }
1871                RT_CACHE_STAT_INC(in_hlist_search);
1872        }
1873        rcu_read_unlock();
1874
1875        /* Multicast recognition logic is moved from route cache to here.
1876           The problem was that too many Ethernet cards have broken/missing
1877           hardware multicast filters :-( As result the host on multicasting
1878           network acquires a lot of useless route cache entries, sort of
1879           SDR messages from all the world. Now we try to get rid of them.
1880           Really, provided software IP multicast filter is organized
1881           reasonably (at least, hashed), it does not result in a slowdown
1882           comparing with route cache reject entries.
1883           Note, that multicast routers are not affected, because
1884           route cache entry is created eventually.
1885         */
1886        if (MULTICAST(daddr)) {
1887                struct in_device *in_dev;
1888
1889                read_lock(&inetdev_lock);
1890                if ((in_dev = __in_dev_get(dev)) != NULL) {
1891                        int our = ip_check_mc(in_dev, daddr, saddr,
1892                                skb->nh.iph->protocol);
1893                        if (our
1894#ifdef CONFIG_IP_MROUTE
1895                            || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1896#endif
1897                            ) {
1898                                read_unlock(&inetdev_lock);
1899                                return ip_route_input_mc(skb, daddr, saddr,
1900                                                         tos, dev, our);
1901                        }
1902                }
1903                read_unlock(&inetdev_lock);
1904                return -EINVAL;
1905        }
1906        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1907}
1908
1909/*
1910 * Major route resolver routine.
 *
 * Slow path of output route resolution.  A private lookup key (fl) is
 * built from the caller's flow (oldflp), an output device and source
 * address are chosen, the FIB is consulted with fib_lookup() where
 * needed, and a freshly allocated rtable entry is filled in and passed
 * to rt_intern_hash() together with rp.
 *
 * oldflp is only read here; the cache key stored in rth->fl is taken
 * from oldflp, while rt_src/rt_dst and friends come from the resolved
 * key fl.
 *
 * Returns the value of rt_intern_hash() on the normal path, otherwise
 * -EINVAL, -ENODEV, -ENETUNREACH or -ENOBUFS as set on the failure
 * paths below.
1911 */
1912
1913int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
1914{
1915        u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
1916        struct flowi fl = { .nl_u = { .ip4_u =
1917                                      { .daddr = oldflp->fl4_dst,
1918                                        .saddr = oldflp->fl4_src,
1919                                        .tos = tos & IPTOS_RT_MASK,
1920                                        .scope = ((tos & RTO_ONLINK) ?
1921                                                  RT_SCOPE_LINK :
1922                                                  RT_SCOPE_UNIVERSE),
1923#ifdef CONFIG_IP_ROUTE_FWMARK
1924                                        .fwmark = oldflp->fl4_fwmark
1925#endif
1926                                      } },
1927                            .iif = loopback_dev.ifindex,
1928                            .oif = oldflp->oif };
1929        struct fib_result res;
1930        unsigned flags = 0;
1931        struct rtable *rth;
1932        struct net_device *dev_out = NULL;
1933        struct in_device *in_dev = NULL;
1934        unsigned hash;
            /* Set once fib_lookup() succeeded, so that "done" releases res. */
1935        int free_res = 0;
1936        int err;
1937
1938        res.fi          = NULL;
1939#ifdef CONFIG_IP_MULTIPLE_TABLES
1940        res.r           = NULL;
1941#endif
1942
            /* The caller supplied a source address: it must not be
             * multicast, bad-class or zeronet, and ip_dev_find() must
             * find a device for it; otherwise fail with -EINVAL.
             */
1943        if (oldflp->fl4_src) {
1944                err = -EINVAL;
1945                if (MULTICAST(oldflp->fl4_src) ||
1946                    BADCLASS(oldflp->fl4_src) ||
1947                    ZERONET(oldflp->fl4_src))
1948                        goto out;
1949
1950                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1951                dev_out = ip_dev_find(oldflp->fl4_src);
1952                if (dev_out == NULL)
1953                        goto out;
1954
1955                /* I removed check for oif == dev_out->oif here.
1956                   It was wrong for two reasons:
1957                   1. ip_dev_find(saddr) can return wrong iface, if saddr is
1958                      assigned to multiple interfaces.
1959                   2. Moreover, we are allowed to send packets with saddr
1960                      of another iface. --ANK
1961                 */
1962
1963                if (oldflp->oif == 0
1964                    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
1965                        /* Special hack: user can direct multicasts
1966                           and limited broadcast via necessary interface
1967                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1968                           This hack is not just for fun, it allows
1969                           vic,vat and friends to work.
1970                           They bind socket to loopback, set ttl to zero
1971                           and expect that it will work.
1972                           From the viewpoint of routing cache they are broken,
1973                           because we are not allowed to build multicast path
1974                           with loopback source addr (look, routing cache
1975                           cannot know, that ttl is zero, so that packet
1976                           will not leave this host and route is valid).
1977                           Luckily, this hack is good workaround.
1978                         */
1979
1980                        fl.oif = dev_out->ifindex;
1981                        goto make_route;
1982                }
                    /* The device was only needed to validate the source;
                     * drop the reference again.
                     */
1983                if (dev_out)
1984                        dev_put(dev_out);
1985                dev_out = NULL;
1986        }
            /* The caller supplied an output interface: it must exist and
             * have an in_device attached, otherwise fail with -ENODEV.
             */
1987        if (oldflp->oif) {
1988                dev_out = dev_get_by_index(oldflp->oif);
1989                err = -ENODEV;
1990                if (dev_out == NULL)
1991                        goto out;
1992                if (__in_dev_get(dev_out) == NULL) {
1993                        dev_put(dev_out);
1994                        goto out;       /* Wrong error code */
1995                }
1996
1997                if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
1998                        if (!fl.fl4_src)
1999                                fl.fl4_src = inet_select_addr(dev_out, 0,
2000                                                              RT_SCOPE_LINK);
2001                        goto make_route;
2002                }
2003                if (!fl.fl4_src) {
2004                        if (MULTICAST(oldflp->fl4_dst))
2005                                fl.fl4_src = inet_select_addr(dev_out, 0,
2006                                                              fl.fl4_scope);
2007                        else if (!oldflp->fl4_dst)
2008                                fl.fl4_src = inet_select_addr(dev_out, 0,
2009                                                              RT_SCOPE_HOST);
2010                }
2011        }
2012
            /* No destination given: use the source address as destination,
             * or the loopback address for both if there is no source
             * either, and route via loopback_dev as a local route.
             */
2013        if (!fl.fl4_dst) {
2014                fl.fl4_dst = fl.fl4_src;
2015                if (!fl.fl4_dst)
2016                        fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2017                if (dev_out)
2018                        dev_put(dev_out);
2019                dev_out = &loopback_dev;
2020                dev_hold(dev_out);
2021                fl.oif = loopback_dev.ifindex;
2022                res.type = RTN_LOCAL;
2023                flags |= RTCF_LOCAL;
2024                goto make_route;
2025        }
2026
            /* A non-zero return from fib_lookup() is treated as "no route". */
2027        if (fib_lookup(&fl, &res)) {
2028                res.fi = NULL;
2029                if (oldflp->oif) {
2030                        /* Apparently, routing tables are wrong. Assume,
2031                           that the destination is on link.
2032
2033                           WHY? DW.
2034                           Because we are allowed to send to iface
2035                           even if it has NO routes and NO assigned
2036                           addresses. When oif is specified, routing
2037                           tables are looked up with only one purpose:
2038                           to catch if destination is gatewayed, rather than
2039                           direct. Moreover, if MSG_DONTROUTE is set,
2040                           we send packet, ignoring both routing tables
2041                           and ifaddr state. --ANK
2042
2043
2044                           We could make it even if oif is unknown,
2045                           likely IPv6, but we do not.
2046                         */
2047
2048                        if (fl.fl4_src == 0)
2049                                fl.fl4_src = inet_select_addr(dev_out, 0,
2050                                                              RT_SCOPE_LINK);
2051                        res.type = RTN_UNICAST;
2052                        goto make_route;
2053                }
2054                if (dev_out)
2055                        dev_put(dev_out);
2056                err = -ENETUNREACH;
2057                goto out;
2058        }
2059        free_res = 1;
2060
2061        if (res.type == RTN_NAT)
2062                goto e_inval;
2063
            /* Destination is local: deliver through loopback_dev and drop
             * the fib_info so that it is not used for the next hop.
             */
2064        if (res.type == RTN_LOCAL) {
2065                if (!fl.fl4_src)
2066                        fl.fl4_src = fl.fl4_dst;
2067                if (dev_out)
2068                        dev_put(dev_out);
2069                dev_out = &loopback_dev;
2070                dev_hold(dev_out);
2071                fl.oif = dev_out->ifindex;
2072                if (res.fi)
2073                        fib_info_put(res.fi);
2074                res.fi = NULL;
2075                flags |= RTCF_LOCAL;
2076                goto make_route;
2077        }
2078
2079#ifdef CONFIG_IP_ROUTE_MULTIPATH
2080        if (res.fi->fib_nhs > 1 && fl.oif == 0)
2081                fib_select_multipath(&fl, &res);
2082        else
2083#endif
2084        if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2085                fib_select_default(&fl, &res);
2086
2087        if (!fl.fl4_src)
2088                fl.fl4_src = FIB_RES_PREFSRC(res);
2089
            /* From here on the output device is the one named by the FIB
             * result; any device reference taken earlier is replaced.
             */
2090        if (dev_out)
2091                dev_put(dev_out);
2092        dev_out = FIB_RES_DEV(res);
2093        dev_hold(dev_out);
2094        fl.oif = dev_out->ifindex;
2095
            /* Common tail: every path arriving here holds a reference on
             * dev_out.  Validate the addresses, classify broadcast and
             * multicast destinations, then build and cache the entry.
             */
2096make_route:
2097        if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2098                goto e_inval;
2099
2100        if (fl.fl4_dst == 0xFFFFFFFF)
2101                res.type = RTN_BROADCAST;
2102        else if (MULTICAST(fl.fl4_dst))
2103                res.type = RTN_MULTICAST;
2104        else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
2105                goto e_inval;
2106
2107        if (dev_out->flags & IFF_LOOPBACK)
2108                flags |= RTCF_LOCAL;
2109
2110        in_dev = in_dev_get(dev_out);
2111        if (!in_dev)
2112                goto e_inval;
2113
2114        if (res.type == RTN_BROADCAST) {
2115                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2116                if (res.fi) {
2117                        fib_info_put(res.fi);
2118                        res.fi = NULL;
2119                }
2120        } else if (res.type == RTN_MULTICAST) {
2121                flags |= RTCF_MULTICAST|RTCF_LOCAL;
2122                if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
2123                        flags &= ~RTCF_LOCAL;
2124                /* If multicast route do not exist use
2125                   default one, but do not gateway in this case.
2126                   Yes, it is hack.
2127                 */
2128                if (res.fi && res.prefixlen < 4) {
2129                        fib_info_put(res.fi);
2130                        res.fi = NULL;
2131                }
2132        }
2133
2134        rth = dst_alloc(&ipv4_dst_ops);
2135        if (!rth)
2136                goto e_nobufs;
2137
2138        atomic_set(&rth->u.dst.__refcnt, 1);
2139        rth->u.dst.flags= DST_HOST;
2140        if (in_dev->cnf.no_xfrm)
2141                rth->u.dst.flags |= DST_NOXFRM;
2142        if (in_dev->cnf.no_policy)
2143                rth->u.dst.flags |= DST_NOPOLICY;
            /* The cache key mirrors what the caller asked for (oldflp),
             * because __ip_route_output_key() compares against these
             * fields; the resolved values go into rt_dst/rt_src below.
             */
2144        rth->fl.fl4_dst = oldflp->fl4_dst;
2145        rth->fl.fl4_tos = tos;
2146        rth->fl.fl4_src = oldflp->fl4_src;
2147        rth->fl.oif     = oldflp->oif;
2148#ifdef CONFIG_IP_ROUTE_FWMARK
2149        rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2150#endif
2151        rth->rt_dst     = fl.fl4_dst;
2152        rth->rt_src     = fl.fl4_src;
2153#ifdef CONFIG_IP_ROUTE_NAT
2154        rth->rt_dst_map = fl.fl4_dst;
2155        rth->rt_src_map = fl.fl4_src;
2156#endif
2157        rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
            /* The cache entry takes its own device reference; the local
             * one is dropped at "done".
             */
2158        rth->u.dst.dev  = dev_out;
2159        dev_hold(dev_out);
2160        rth->rt_gateway = fl.fl4_dst;
2161        rth->rt_spec_dst= fl.fl4_src;
2162
2163        rth->u.dst.output=ip_output;
2164
2165        RT_CACHE_STAT_INC(out_slow_tot);
2166
2167        if (flags & RTCF_LOCAL) {
2168                rth->u.dst.input = ip_local_deliver;
2169                rth->rt_spec_dst = fl.fl4_dst;
2170        }
2171        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2172                rth->rt_spec_dst = fl.fl4_src;
2173                if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2174                        rth->u.dst.output = ip_mc_output;
2175                        RT_CACHE_STAT_INC(out_slow_mc);
2176                }
2177#ifdef CONFIG_IP_MROUTE
2178                if (res.type == RTN_MULTICAST) {
2179                        if (IN_DEV_MFORWARD(in_dev) &&
2180                            !LOCAL_MCAST(oldflp->fl4_dst)) {
2181                                rth->u.dst.input = ip_mr_input;
2182                                rth->u.dst.output = ip_mc_output;
2183                        }
2184                }
2185#endif
2186        }
2187
2188        rt_set_nexthop(rth, &res, 0);
2189        
2190
2191        rth->rt_flags = flags;
2192
            /* Same hash inputs as the lookup in __ip_route_output_key(). */
2193        hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2194        err = rt_intern_hash(hash, rth, rp);
            /* Release the local references: FIB result, device, in_device. */
2195done:
2196        if (free_res)
2197                fib_res_put(&res);
2198        if (dev_out)
2199                dev_put(dev_out);
2200        if (in_dev)
2201                in_dev_put(in_dev);
2202out:    return err;
2203
2204e_inval:
2205        err = -EINVAL;
2206        goto done;
2207e_nobufs:
2208        err = -ENOBUFS;
2209        goto done;
2210}
2211
2212int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2213{
2214        unsigned hash;
2215        struct rtable *rth;
2216
2217        hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2218
2219        rcu_read_lock();
2220        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2221                smp_read_barrier_depends();
2222                if (rth->fl.fl4_dst == flp->fl4_dst &&
2223                    rth->fl.fl4_src == flp->fl4_src &&
2224                    rth->fl.iif == 0 &&
2225                    rth->fl.oif == flp->oif &&
2226#ifdef CONFIG_IP_ROUTE_FWMARK
2227                    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2228#endif
2229                    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2230                            (IPTOS_RT_MASK | RTO_ONLINK))) {
2231                        rth->u.dst.lastuse = jiffies;
2232                        dst_hold(&rth->u.dst);
2233                        rth->u.dst.__use++;
2234                        RT_CACHE_STAT_INC(out_hit);
2235                        rcu_read_unlock();
2236                        *rp = rth;
2237                        return 0;
2238                }
2239                RT_CACHE_STAT_INC(out_hlist_search);
2240        }
2241        rcu_read_unlock();
2242
2243        return ip_route_output_slow(rp, flp);
2244}
2245
2246int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2247{
2248        int err;
2249
2250        if ((err = __ip_route_output_key(rp, flp)) != 0)
2251                return err;
2252        return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, NULL, 0) : 0;
2253}
2254
2255int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2256{
2257        int err;
2258
2259        if ((err = __ip_route_output_key(rp, flp)) != 0)
2260                return err;
2261        return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, sk, flags) : 0;
2262}
2263
2264static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2265                        int nowait)
2266{
2267        struct rtable *rt = (struct rtable*)skb->dst;
2268        struct rtmsg *r;
2269        struct nlmsghdr  *nlh;
2270        unsigned char    *b = skb->tail;
2271        struct rta_cacheinfo ci;
2272#ifdef CONFIG_IP_MROUTE
2273        struct rtattr *eptr;
2274#endif
2275        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2276        r = NLMSG_DATA(nlh);
2277        nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2278        r->rtm_family    = AF_INET;
2279        r->rtm_dst_len  = 32;
2280        r->rtm_src_len  = 0;
2281        r->rtm_tos      = rt->fl.fl4_tos;
2282        r->rtm_table    = RT_TABLE_MAIN;
2283        r->rtm_type     = rt->rt_type;
2284        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2285        r->rtm_protocol = RTPROT_UNSPEC;
2286        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2287        if (rt->rt_flags & RTCF_NOTIFY)
2288                r->rtm_flags |= RTM_F_NOTIFY;
2289        RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2290        if (rt->fl.fl4_src) {
2291                r->rtm_src_len = 32;
2292                RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2293        }
2294        if (rt->u.dst.dev)
2295                RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2296#ifdef CONFIG_NET_CLS_ROUTE
2297        if (rt->u.dst.tclassid)
2298                RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2299#endif
2300        if (rt->fl.iif)
2301                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2302        else if (rt->rt_src != rt->fl.fl4_src)
2303                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2304        if (rt->rt_dst != rt->rt_gateway)
2305                RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2306        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2307                goto rtattr_failure;
2308        ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2309        ci.rta_used     = rt->u.dst.__use;
2310        ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2311        if (rt->u.dst.expires)
2312                ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2313        else
2314                ci.rta_expires = 0;
2315        ci.rta_error    = rt->u.dst.error;
2316        ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2317        if (rt->peer) {
2318                ci.rta_id = rt->peer->ip_id_count;
2319                if (rt->peer->tcp_ts_stamp) {
2320                        ci.rta_ts = rt->peer->tcp_ts;
2321                        ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2322                }
2323        }
2324#ifdef CONFIG_IP_MROUTE
2325        eptr = (struct rtattr*)skb->tail;
2326#endif
2327        RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2328        if (rt->fl.iif) {
2329#ifdef CONFIG_IP_MROUTE
2330                u32 dst = rt->rt_dst;
2331
2332                if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2333                    ipv4_devconf.mc_forwarding) {
2334                        int err = ipmr_get_route(skb, r, nowait);
2335                        if (err <= 0) {
2336                                if (!nowait) {
2337                                        if (err == 0)
2338                                                return 0;
2339                                        goto nlmsg_failure;
2340                                } else {
2341                                        if (err == -EMSGSIZE)
2342                                                goto nlmsg_failure;
2343                                        ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2344                                }
2345                        }
2346                } else
2347#endif
2348                        RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2349        }
2350
2351        nlh->nlmsg_len = skb->tail - b;
2352        return skb->len;
2353
2354nlmsg_failure:
2355rtattr_failure:
2356        skb_trim(skb, b - skb->data);
2357        return -1;
2358}
2359
2360int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2361{
2362        struct rtattr **rta = arg;
2363        struct rtmsg *rtm = NLMSG_DATA(nlh);
2364        struct rtable *rt = NULL;
2365        u32 dst = 0;
2366        u32 src = 0;
2367        int iif = 0;
2368        int err = -ENOBUFS;
2369        struct sk_buff *skb;
2370
2371        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2372        if (!skb)
2373                goto out;
2374
2375        /* Reserve room for dummy headers, this skb can pass
2376           through good chunk of routing engine.
2377         */
2378        skb->mac.raw = skb->data;
2379        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2380
2381        if (rta[RTA_SRC - 1])
2382                memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2383        if (rta[RTA_DST - 1])
2384                memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2385        if (rta[RTA_IIF - 1])
2386                memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2387
2388        if (iif) {
2389                struct net_device *dev = __dev_get_by_index(iif);
2390                err = -ENODEV;
2391                if (!dev)
2392                        goto out_free;
2393                skb->protocol   = htons(ETH_P_IP);
2394                skb->dev        = dev;
2395                local_bh_disable();
2396                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2397                local_bh_enable();
2398                rt = (struct rtable*)skb->dst;
2399                if (!err && rt->u.dst.error)
2400                        err = -rt->u.dst.error;
2401        } else {
2402                struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2403                                                         .saddr = src,
2404                                                         .tos = rtm->rtm_tos } } };
2405                int oif = 0;
2406                if (rta[RTA_OIF - 1])
2407                        memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2408                fl.oif = oif;
2409                err = ip_route_output_key(&rt, &fl);
2410        }
2411        if (err)
2412                goto out_free;
2413
2414        skb->dst = &rt->u.dst;
2415        if (rtm->rtm_flags & RTM_F_NOTIFY)
2416                rt->rt_flags |= RTCF_NOTIFY;
2417
2418        NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2419
2420        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2421                                RTM_NEWROUTE, 0);
2422        if (!err)
2423                goto out_free;
2424        if (err < 0) {
2425                err = -EMSGSIZE;
2426                goto out_free;
2427        }
2428
2429        err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2430        if (err > 0)
2431                err = 0;
2432out:    return err;
2433
2434out_free:
2435        kfree_skb(skb);
2436        goto out;
2437}
2438
2439int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2440{
2441        struct rtable *rt;
2442        int h, s_h;
2443        int idx, s_idx;
2444
2445        s_h = cb->args[0];
2446        s_idx = idx = cb->args[1];
2447        for (h = 0; h <= rt_hash_mask; h++) {
2448                if (h < s_h) continue;
2449                if (h > s_h)
2450                        s_idx = 0;
2451                rcu_read_lock();
2452                for (rt = rt_hash_table[h].chain, idx = 0; rt;
2453                     rt = rt->u.rt_next, idx++) {
2454                        smp_read_barrier_depends();
2455                        if (idx < s_idx)
2456                                continue;
2457                        skb->dst = dst_clone(&rt->u.dst);
2458                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2459                                         cb->nlh->nlmsg_seq,
2460                                         RTM_NEWROUTE, 1) <= 0) {
2461                                dst_release(xchg(&skb->dst, NULL));
2462                                rcu_read_unlock();
2463                                goto done;
2464                        }
2465                        dst_release(xchg(&skb->dst, NULL));
2466                }
2467                rcu_read_unlock();
2468        }
2469
2470done:
2471        cb->args[0] = h;
2472        cb->args[1] = idx;
2473        return skb->len;
2474}
2475
/*
 * Hook invoked with the in_device concerned (presumably when its multicast
 * state changes -- confirm against callers).  The device itself is not
 * examined: the whole routing cache is flushed via rt_cache_flush(0).
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(0);
}
2480
2481#ifdef CONFIG_SYSCTL
2482static int flush_delay;
2483
2484static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2485                                        struct file *filp, void *buffer,
2486                                        size_t *lenp)
2487{
2488        if (write) {
2489                proc_dointvec(ctl, write, filp, buffer, lenp);
2490                rt_cache_flush(flush_delay);
2491                return 0;
2492        } 
2493
2494        return -EINVAL;
2495}
2496
2497static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
2498                                                int nlen, void *oldval,
2499                                                size_t *oldlenp, void *newval,
2500                                                size_t newlen, void **context)
2501{
2502        int delay;
2503        if (newlen != sizeof(int))
2504                return -EINVAL;
2505        if (get_user(delay, (int *)newval))
2506                return -EFAULT; 
2507        rt_cache_flush(delay); 
2508        return 0;
2509}
2510
2511ctl_table ipv4_route_table[] = {
2512        {
2513                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2514                .procname       = "flush",
2515                .data           = &flush_delay,
2516                .maxlen         = sizeof(int),
2517                .mode           = 0644,
2518                .proc_handler   = &ipv4_sysctl_rtcache_flush,
2519                .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2520        },
2521        {
2522                .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2523                .procname       = "min_delay",
2524                .data           = &ip_rt_min_delay,
2525                .maxlen         = sizeof(int),
2526                .mode           = 0644,
2527                .proc_handler   = &proc_dointvec_jiffies,
2528                .strategy       = &sysctl_jiffies,
2529        },
2530        {
2531                .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2532                .procname       = "max_delay",
2533                .data           = &ip_rt_max_delay,
2534                .maxlen         = sizeof(int),
2535                .mode           = 0644,
2536                .proc_handler   = &proc_dointvec_jiffies,
2537                .strategy       = &sysctl_jiffies,
2538        },
2539        {
2540                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2541                .procname       = "gc_thresh",
2542                .data           = &ipv4_dst_ops.gc_thresh,
2543                .maxlen         = sizeof(int),
2544                .mode           = 0644,
2545                .proc_handler   = &proc_dointvec,
2546        },
2547        {
2548                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2549                .procname       = "max_size",
2550                .data           = &ip_rt_max_size,
2551                .maxlen         = sizeof(int),
2552                .mode           = 0644,
2553                .proc_handler   = &proc_dointvec,
2554        },
2555        {
2556                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2557                .procname       = "gc_min_interval",
2558                .data           = &ip_rt_gc_min_interval,
2559                .maxlen         = sizeof(int),
2560                .mode           = 0644,
2561                .proc_handler   = &proc_dointvec_jiffies,
2562                .strategy       = &sysctl_jiffies,
2563        },
2564        {
2565                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2566                .procname       = "gc_timeout",
2567                .data           = &ip_rt_gc_timeout,
2568                .maxlen         = sizeof(int),
2569                .mode           = 0644,
2570                .proc_handler   = &proc_dointvec_jiffies,
2571                .strategy       = &sysctl_jiffies,
2572        },
2573        {
2574                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2575                .procname       = "gc_interval",
2576                .data           = &ip_rt_gc_interval,
2577                .maxlen         = sizeof(int),
2578                .mode           = 0644,
2579                .proc_handler   = &proc_dointvec_jiffies,
2580                .strategy       = &sysctl_jiffies,
2581        },
2582        {
2583                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2584                .procname       = "redirect_load",
2585                .data           = &ip_rt_redirect_load,
2586                .maxlen         = sizeof(int),
2587                .mode           = 0644,
2588                .proc_handler   = &proc_dointvec,
2589        },
2590        {
2591                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2592                .procname       = "redirect_number",
2593                .data           = &ip_rt_redirect_number,
2594                .maxlen         = sizeof(int),
2595                .mode           = 0644,
2596                .proc_handler   = &proc_dointvec,
2597        },
2598        {
2599                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2600                .procname       = "redirect_silence",
2601                .data           = &ip_rt_redirect_silence,
2602                .maxlen         = sizeof(int),
2603                .mode           = 0644,
2604                .proc_handler   = &proc_dointvec,
2605        },
2606        {
2607                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2608                .procname       = "error_cost",
2609                .data           = &ip_rt_error_cost,
2610                .maxlen         = sizeof(int),
2611                .mode           = 0644,
2612                .proc_handler   = &proc_dointvec,
2613        },
2614        {
2615                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2616                .procname       = "error_burst",
2617                .data           = &ip_rt_error_burst,
2618                .maxlen         = sizeof(int),
2619                .mode           = 0644,
2620                .proc_handler   = &proc_dointvec,
2621        },
2622        {
2623                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2624                .procname       = "gc_elasticity",
2625                .data           = &ip_rt_gc_elasticity,
2626                .maxlen         = sizeof(int),
2627                .mode           = 0644,
2628                .proc_handler   = &proc_dointvec,
2629        },
2630        {
2631                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2632                .procname       = "mtu_expires",
2633                .data           = &ip_rt_mtu_expires,
2634                .maxlen         = sizeof(int),
2635                .mode           = 0644,
2636                .proc_handler   = &proc_dointvec_jiffies,
2637                .strategy       = &sysctl_jiffies,
2638        },
2639        {
2640                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2641                .procname       = "min_pmtu",
2642                .data           = &ip_rt_min_pmtu,
2643                .maxlen         = sizeof(int),
2644                .mode           = 0644,
2645                .proc_handler   = &proc_dointvec,
2646        },
2647        {
2648                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2649                .procname       = "min_adv_mss",
2650                .data           = &ip_rt_min_advmss,
2651                .maxlen         = sizeof(int),
2652                .mode           = 0644,
2653                .proc_handler   = &proc_dointvec,
2654        },
2655        {
2656                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2657                .procname       = "secret_interval",
2658                .data           = &ip_rt_secret_interval,
2659                .maxlen         = sizeof(int),
2660                .mode           = 0644,
2661                .proc_handler   = &proc_dointvec_jiffies,
2662                .strategy       = &sysctl_jiffies,
2663        },
2664        { .ctl_name = 0 }
2665};
2666#endif
2667
2668#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Base of the route accounting area.  It is used as NR_CPUS consecutive
 * slices of 256 struct ip_rt_acct entries, one slice per logical cpu.
 */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.  The argument is
 * parenthesized so that an expression such as IP_RT_ACCT_CPU(a + b) is
 * scaled as a whole.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
2675
2676#ifdef CONFIG_PROC_FS
2677static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2678                           int length, int *eof, void *data)
2679{
2680        unsigned int i;
2681
2682        if ((offset & 3) || (length & 3))
2683                return -EIO;
2684
2685        if (offset >= sizeof(struct ip_rt_acct) * 256) {
2686                *eof = 1;
2687                return 0;
2688        }
2689
2690        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2691                length = sizeof(struct ip_rt_acct) * 256 - offset;
2692                *eof = 1;
2693        }
2694
2695        offset /= sizeof(u32);
2696
2697        if (length > 0) {
2698                u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2699                u32 *dst = (u32 *) buffer;
2700
2701                /* Copy first cpu. */
2702                *start = buffer;
2703                memcpy(dst, src, length);
2704
2705                /* Add the other cpus in, one int at a time */
2706                for (i = 1; i < NR_CPUS; i++) {
2707                        unsigned int j;
2708
2709                        if (!cpu_online(i))
2710                                continue;
2711
2712                        src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2713
2714                        for (j = 0; j < length/4; j++)
2715                                dst[j] += src[j];
2716                }
2717        }
2718        return length;
2719}
2720#endif /* CONFIG_PROC_FS */
2721#endif /* CONFIG_NET_CLS_ROUTE */
2722
2723int __init ip_rt_init(void)
2724{
2725        int i, order, goal, rc = 0;
2726
2727        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2728                             (jiffies ^ (jiffies >> 7)));
2729
2730#ifdef CONFIG_NET_CLS_ROUTE
2731        for (order = 0;
2732             (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2733                /* NOTHING */;
2734        ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2735        if (!ip_rt_acct)
2736                panic("IP: failed to allocate ip_rt_acct\n");
2737        memset(ip_rt_acct, 0, PAGE_SIZE << order);
2738#endif
2739
2740        ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2741                                                     sizeof(struct rtable),
2742                                                     0, SLAB_HWCACHE_ALIGN,
2743                                                     NULL, NULL);
2744
2745        if (!ipv4_dst_ops.kmem_cachep)
2746                panic("IP: failed to allocate ip_dst_cache\n");
2747
2748        goal = num_physpages >> (26 - PAGE_SHIFT);
2749
2750        for (order = 0; (1UL << order) < goal; order++)
2751                /* NOTHING */;
2752
2753        do {
2754                rt_hash_mask = (1UL << order) * PAGE_SIZE /
2755                        sizeof(struct rt_hash_bucket);
2756                while (rt_hash_mask & (rt_hash_mask - 1))
2757                        rt_hash_mask--;
2758                rt_hash_table = (struct rt_hash_bucket *)
2759                        __get_free_pages(GFP_ATOMIC, order);
2760        } while (rt_hash_table == NULL && --order > 0);
2761
2762        if (!rt_hash_table)
2763                panic("Failed to allocate IP route cache hash table\n");
2764
2765        printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2766               rt_hash_mask,
2767               (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2768
2769        for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2770                /* NOTHING */;
2771
2772        rt_hash_mask--;
2773        for (i = 0; i <= rt_hash_mask; i++) {
2774                rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
2775                rt_hash_table[i].chain = NULL;
2776        }
2777
2778        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2779        ip_rt_max_size = (rt_hash_mask + 1) * 16;
2780
2781        rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2782        if (!rt_cache_stat)
2783                return -ENOMEM;
2784
2785        devinet_init();
2786        ip_fib_init();
2787
2788        init_timer(&rt_flush_timer);
2789        rt_flush_timer.function = rt_run_flush;
2790        init_timer(&rt_periodic_timer);
2791        rt_periodic_timer.function = rt_check_expire;
2792        init_timer(&rt_secret_timer);
2793        rt_secret_timer.function = rt_secret_rebuild;
2794
2795        /* All the timers, started at system startup tend
2796           to synchronize. Perturb it a bit.
2797         */
2798        rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2799                                        ip_rt_gc_interval;
2800        add_timer(&rt_periodic_timer);
2801
2802        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2803                ip_rt_secret_interval;
2804        add_timer(&rt_secret_timer);
2805
2806#ifdef CONFIG_PROC_FS
2807        if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2808            !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
2809                free_percpu(rt_cache_stat);
2810                return -ENOMEM;
2811        }
2812
2813#ifdef CONFIG_NET_CLS_ROUTE
2814        create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
2815#endif
2816#endif
2817#ifdef CONFIG_XFRM
2818        xfrm_init();
2819        xfrm4_init();
2820#endif
2821        return rc;
2822}
2823
2824EXPORT_SYMBOL(__ip_select_ident);
2825EXPORT_SYMBOL(ip_route_input);
2826EXPORT_SYMBOL(ip_route_output_key);
2827
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.