linux-old/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Version:     $Id: route.c,v 1.102.2.1 2002/01/12 07:43:57 davem Exp $
   9 *
  10 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15 *
  16 * Fixes:
  17 *              Alan Cox        :       Verify area fixes.
  18 *              Alan Cox        :       cli() protects routing changes
  19 *              Rui Oliveira    :       ICMP routing table updates
  20 *              (rco@di.uminho.pt)      Routing table insertion and update
  21 *              Linus Torvalds  :       Rewrote bits to be sensible
  22 *              Alan Cox        :       Added BSD route gw semantics
  23 *              Alan Cox        :       Super /proc >4K 
  24 *              Alan Cox        :       MTU in route table
  25 *              Alan Cox        :       MSS actually. Also added the window
  26 *                                      clamper.
  27 *              Sam Lantinga    :       Fixed route matching in rt_del()
  28 *              Alan Cox        :       Routing cache support.
  29 *              Alan Cox        :       Removed compatibility cruft.
  30 *              Alan Cox        :       RTF_REJECT support.
  31 *              Alan Cox        :       TCP irtt support.
  32 *              Jonathan Naylor :       Added Metric support.
  33 *      Miquel van Smoorenburg  :       BSD API fixes.
  34 *      Miquel van Smoorenburg  :       Metrics.
  35 *              Alan Cox        :       Use __u32 properly
  36 *              Alan Cox        :       Aligned routing errors more closely with BSD
  37 *                                      our system is still very different.
  38 *              Alan Cox        :       Faster /proc handling
  39 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40 *                                      routing caches and better behaviour.
  41 *              
  42 *              Olaf Erb        :       irtt wasn't being copied right.
  43 *              Bjorn Ekwall    :       Kerneld route support.
  44 *              Alan Cox        :       Multicast fixed (I hope)
  45 *              Pavel Krauz     :       Limited broadcast fixed
  46 *              Mike McLagan    :       Routing by source
  47 *      Alexey Kuznetsov        :       End of old history. Splitted to fib.c and
  48 *                                      route.c and rewritten from scratch.
  49 *              Andi Kleen      :       Load-limit warning messages.
  50 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54 *              Marc Boucher    :       routing by fwmark
  55 *      Robert Olsson           :       Added rt_cache statistics
  56 *
  57 *              This program is free software; you can redistribute it and/or
  58 *              modify it under the terms of the GNU General Public License
  59 *              as published by the Free Software Foundation; either version
  60 *              2 of the License, or (at your option) any later version.
  61 */
  62
  63#include <linux/config.h>
  64#include <asm/uaccess.h>
  65#include <asm/system.h>
  66#include <asm/bitops.h>
  67#include <linux/types.h>
  68#include <linux/kernel.h>
  69#include <linux/sched.h>
  70#include <linux/mm.h>
  71#include <linux/string.h>
  72#include <linux/socket.h>
  73#include <linux/sockios.h>
  74#include <linux/errno.h>
  75#include <linux/in.h>
  76#include <linux/inet.h>
  77#include <linux/netdevice.h>
  78#include <linux/proc_fs.h>
  79#include <linux/init.h>
  80#include <linux/skbuff.h>
  81#include <linux/rtnetlink.h>
  82#include <linux/inetdevice.h>
  83#include <linux/igmp.h>
  84#include <linux/pkt_sched.h>
  85#include <linux/mroute.h>
  86#include <linux/netfilter_ipv4.h>
  87#include <linux/random.h>
  88#include <linux/jhash.h>
  89#include <net/protocol.h>
  90#include <net/ip.h>
  91#include <net/route.h>
  92#include <net/inetpeer.h>
  93#include <net/sock.h>
  94#include <net/ip_fib.h>
  95#include <net/arp.h>
  96#include <net/tcp.h>
  97#include <net/icmp.h>
  98#ifdef CONFIG_SYSCTL
  99#include <linux/sysctl.h>
 100#endif
 101
 102#define IP_MAX_MTU      0xFFF0
 103
 104#define RT_GC_TIMEOUT (300*HZ)
 105
 106int ip_rt_min_delay             = 2 * HZ;
 107int ip_rt_max_delay             = 10 * HZ;
 108int ip_rt_max_size;
 109int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
 110int ip_rt_gc_interval           = 60 * HZ;
 111int ip_rt_gc_min_interval       = HZ / 2;
 112int ip_rt_redirect_number       = 9;
 113int ip_rt_redirect_load         = HZ / 50;
 114int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
 115int ip_rt_error_cost            = HZ;
 116int ip_rt_error_burst           = 5 * HZ;
 117int ip_rt_gc_elasticity         = 8;
 118int ip_rt_mtu_expires           = 10 * 60 * HZ;
 119int ip_rt_min_pmtu              = 512 + 20 + 20;
 120int ip_rt_min_advmss            = 256;
 121int ip_rt_secret_interval       = 10 * 60 * HZ;
 122static unsigned long rt_deadline;
 123
 124#define RTprint(a...)   printk(KERN_DEBUG a)
 125
 126static struct timer_list rt_flush_timer;
 127static struct timer_list rt_periodic_timer;
 128static struct timer_list rt_secret_timer;
 129
 130/*
 131 *      Interface to generic destination cache.
 132 */
 133
 134static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 135static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
 136                                           struct sk_buff *skb);
 137static void              ipv4_dst_destroy(struct dst_entry *dst);
 138static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 139static void              ipv4_link_failure(struct sk_buff *skb);
 140static int rt_garbage_collect(void);
 141
 142
 143struct dst_ops ipv4_dst_ops = {
 144        family:                 AF_INET,
 145        protocol:               __constant_htons(ETH_P_IP),
 146        gc:                     rt_garbage_collect,
 147        check:                  ipv4_dst_check,
 148        reroute:                ipv4_dst_reroute,
 149        destroy:                ipv4_dst_destroy,
 150        negative_advice:        ipv4_negative_advice,
 151        link_failure:           ipv4_link_failure,
 152        entry_size:             sizeof(struct rtable),
 153};
 154
 155#define ECN_OR_COST(class)      TC_PRIO_##class
 156
 157__u8 ip_tos2prio[16] = {
 158        TC_PRIO_BESTEFFORT,
 159        ECN_OR_COST(FILLER),
 160        TC_PRIO_BESTEFFORT,
 161        ECN_OR_COST(BESTEFFORT),
 162        TC_PRIO_BULK,
 163        ECN_OR_COST(BULK),
 164        TC_PRIO_BULK,
 165        ECN_OR_COST(BULK),
 166        TC_PRIO_INTERACTIVE,
 167        ECN_OR_COST(INTERACTIVE),
 168        TC_PRIO_INTERACTIVE,
 169        ECN_OR_COST(INTERACTIVE),
 170        TC_PRIO_INTERACTIVE_BULK,
 171        ECN_OR_COST(INTERACTIVE_BULK),
 172        TC_PRIO_INTERACTIVE_BULK,
 173        ECN_OR_COST(INTERACTIVE_BULK)
 174};
 175
 176
 177/*
 178 * Route cache.
 179 */
 180
  181/* The locking scheme is rather straightforward:
  182 *
  183 * 1) BH-protected rwlocks protect the buckets of the central route hash.
 184 * 2) Only writers remove entries, and they hold the lock
 185 *    as they look at rtable reference counts.
 186 * 3) Only readers acquire references to rtable entries,
 187 *    they do so with atomic increments and with the
 188 *    lock held.
 189 */
 190
 191struct rt_hash_bucket {
 192        struct rtable   *chain;
 193        rwlock_t        lock;
 194} __attribute__((__aligned__(8)));
 195
 196static struct rt_hash_bucket    *rt_hash_table;
 197static unsigned                 rt_hash_mask;
 198static int                      rt_hash_log;
 199static unsigned int             rt_hash_rnd;
 200
 201struct rt_cache_stat rt_cache_stat[NR_CPUS];
 202
 203static int rt_intern_hash(unsigned hash, struct rtable *rth,
 204                                struct rtable **res);
 205
 206static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
 207{
 208        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
 209                & rt_hash_mask);
 210}
 211
 212static int rt_cache_get_info(char *buffer, char **start, off_t offset,
 213                                int length)
 214{
 215        int len = 0;
 216        off_t pos = 128;
 217        char temp[256];
 218        struct rtable *r;
 219        int i;
 220
 221        if (offset < 128) {
 222                sprintf(buffer, "%-127s\n",
 223                        "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 224                        "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 225                        "HHUptod\tSpecDst");
 226                len = 128;
 227        }
 228        
 229        for (i = rt_hash_mask; i >= 0; i--) {
 230                read_lock_bh(&rt_hash_table[i].lock);
 231                for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
 232                        /*
 233                         *      Spin through entries until we are ready
 234                         */
 235                        pos += 128;
 236
 237                        if (pos <= offset) {
 238                                len = 0;
 239                                continue;
 240                        }
 241                        sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 242                                "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 243                                r->u.dst.dev ? r->u.dst.dev->name : "*",
 244                                (unsigned long)r->rt_dst,
 245                                (unsigned long)r->rt_gateway,
 246                                r->rt_flags,
 247                                atomic_read(&r->u.dst.__refcnt),
 248                                r->u.dst.__use,
 249                                0,
 250                                (unsigned long)r->rt_src,
 251                                (r->u.dst.advmss ?
 252                                 (int) r->u.dst.advmss + 40 : 0),
 253                                r->u.dst.window,
 254                                (int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
 255                                r->key.tos,
 256                                r->u.dst.hh ?
 257                                        atomic_read(&r->u.dst.hh->hh_refcnt) :
 258                                        -1,
 259                                r->u.dst.hh ?
 260                                        (r->u.dst.hh->hh_output ==
 261                                         dev_queue_xmit) : 0,
 262                                r->rt_spec_dst);
 263                        sprintf(buffer + len, "%-127s\n", temp);
 264                        len += 128;
 265                        if (pos >= offset+length) {
 266                                read_unlock_bh(&rt_hash_table[i].lock);
 267                                goto done;
 268                        }
 269                }
 270                read_unlock_bh(&rt_hash_table[i].lock);
 271        }
 272
 273done:
 274        *start = buffer + len - (pos - offset);
 275        len = pos - offset;
 276        if (len > length)
 277                len = length;
 278        return len;
 279}
 280
 281static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
 282{
 283        unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
 284        int i, lcpu;
 285        int len = 0;
 286
 287        len += sprintf(buffer+len, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 288        for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) {
 289                i = cpu_logical_map(lcpu);
 290
 291                len += sprintf(buffer+len, "%08x  %08x %08x %08x %08x %08x %08x %08x  %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 292                               dst_entries,                    
 293                               rt_cache_stat[i].in_hit,
 294                               rt_cache_stat[i].in_slow_tot,
 295                               rt_cache_stat[i].in_slow_mc,
 296                               rt_cache_stat[i].in_no_route,
 297                               rt_cache_stat[i].in_brd,
 298                               rt_cache_stat[i].in_martian_dst,
 299                               rt_cache_stat[i].in_martian_src,
 300
 301                               rt_cache_stat[i].out_hit,
 302                               rt_cache_stat[i].out_slow_tot,
 303                               rt_cache_stat[i].out_slow_mc, 
 304
 305                               rt_cache_stat[i].gc_total,
 306                               rt_cache_stat[i].gc_ignored,
 307                               rt_cache_stat[i].gc_goal_miss,
 308                               rt_cache_stat[i].gc_dst_overflow,
 309                               rt_cache_stat[i].in_hlist_search,
 310                               rt_cache_stat[i].out_hlist_search
 311
 312                        );
 313        }
 314        len -= offset;
 315
 316        if (len > length)
 317                len = length;
 318        if (len < 0)
 319                len = 0;
 320
 321        *start = buffer + offset;
 322        return len;
 323}
 324  
 325static __inline__ void rt_free(struct rtable *rt)
 326{
 327        dst_free(&rt->u.dst);
 328}
 329
 330static __inline__ void rt_drop(struct rtable *rt)
 331{
 332        ip_rt_put(rt);
 333        dst_free(&rt->u.dst);
 334}
 335
 336static __inline__ int rt_fast_clean(struct rtable *rth)
 337{
 338        /* Kill broadcast/multicast entries very aggresively, if they
 339           collide in hash table with more useful entries */
 340        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 341                rth->key.iif && rth->u.rt_next;
 342}
 343
 344static __inline__ int rt_valuable(struct rtable *rth)
 345{
 346        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 347                rth->u.dst.expires;
 348}
 349
 350static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 351{
 352        unsigned long age;
 353        int ret = 0;
 354
 355        if (atomic_read(&rth->u.dst.__refcnt))
 356                goto out;
 357
 358        ret = 1;
 359        if (rth->u.dst.expires &&
 360            time_after_eq(jiffies, rth->u.dst.expires))
 361                goto out;
 362
 363        age = jiffies - rth->u.dst.lastuse;
 364        ret = 0;
 365        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 366            (age <= tmo2 && rt_valuable(rth)))
 367                goto out;
 368        ret = 1;
 369out:    return ret;
 370}
 371
 372/* Bits of score are:
 373 * 31: very valuable
 374 * 30: not quite useless
 375 * 29..0: usage counter
 376 */
 377static inline u32 rt_score(struct rtable *rt)
 378{
 379        u32 score = jiffies - rt->u.dst.lastuse;
 380
 381        score = ~score & ~(3<<30);
 382
 383        if (rt_valuable(rt))
 384                score |= (1<<31);
 385
 386        if (!rt->key.iif ||
 387            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 388                score |= (1<<30);
 389
 390        return score;
 391}
 392
 393/* This runs via a timer and thus is always in BH context. */
 394static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
 395{
 396        static int rover;
 397        int i = rover, t;
 398        struct rtable *rth, **rthp;
 399        unsigned long now = jiffies;
 400
 401        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
 402             t -= ip_rt_gc_timeout) {
 403                unsigned long tmo = ip_rt_gc_timeout;
 404
 405                i = (i + 1) & rt_hash_mask;
 406                rthp = &rt_hash_table[i].chain;
 407
 408                write_lock(&rt_hash_table[i].lock);
 409                while ((rth = *rthp) != NULL) {
 410                        if (rth->u.dst.expires) {
 411                                /* Entry is expired even if it is in use */
 412                                if (time_before_eq(now, rth->u.dst.expires)) {
 413                                        tmo >>= 1;
 414                                        rthp = &rth->u.rt_next;
 415                                        continue;
 416                                }
 417                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 418                                tmo >>= 1;
 419                                rthp = &rth->u.rt_next;
 420                                continue;
 421                        }
 422
 423                        /* Cleanup aged off entries. */
 424                        *rthp = rth->u.rt_next;
 425                        rt_free(rth);
 426                }
 427                write_unlock(&rt_hash_table[i].lock);
 428
 429                /* Fallback loop breaker. */
 430                if (time_after(jiffies, now))
 431                        break;
 432        }
 433        rover = i;
 434        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
 435}
 436
 437SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);
 438
 439/* This can run from both BH and non-BH contexts, the latter
 440 * in the case of a forced flush event.
 441 */
 442static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
 443{
 444        int i;
 445        struct rtable *rth, *next;
 446
 447        rt_deadline = 0;
 448
 449        get_random_bytes(&rt_hash_rnd, 4);
 450
 451        for (i = rt_hash_mask; i >= 0; i--) {
 452                write_lock_bh(&rt_hash_table[i].lock);
 453                rth = rt_hash_table[i].chain;
 454                if (rth)
 455                        rt_hash_table[i].chain = NULL;
 456                write_unlock_bh(&rt_hash_table[i].lock);
 457
 458                for (; rth; rth = next) {
 459                        next = rth->u.rt_next;
 460                        rt_free(rth);
 461                }
 462        }
 463}
 464
 465SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);
 466  
 467static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
 468
 469void rt_cache_flush(int delay)
 470{
 471        unsigned long now = jiffies;
 472        int user_mode = !in_softirq();
 473
 474        if (delay < 0)
 475                delay = ip_rt_min_delay;
 476
 477        spin_lock_bh(&rt_flush_lock);
 478
 479        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 480                long tmo = (long)(rt_deadline - now);
 481
 482                /* If flush timer is already running
 483                   and flush request is not immediate (delay > 0):
 484
 485                   if deadline is not achieved, prolongate timer to "delay",
 486                   otherwise fire it at deadline time.
 487                 */
 488
 489                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 490                        tmo = 0;
 491                
 492                if (delay > tmo)
 493                        delay = tmo;
 494        }
 495
 496        if (delay <= 0) {
 497                spin_unlock_bh(&rt_flush_lock);
 498                SMP_TIMER_NAME(rt_run_flush)(0);
 499                return;
 500        }
 501
 502        if (rt_deadline == 0)
 503                rt_deadline = now + ip_rt_max_delay;
 504
 505        mod_timer(&rt_flush_timer, now+delay);
 506        spin_unlock_bh(&rt_flush_lock);
 507}
 508
 509static void rt_secret_rebuild(unsigned long dummy)
 510{
 511        unsigned long now = jiffies;
 512
 513        rt_cache_flush(0);
 514        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 515}
 516
 517/*
 518   Short description of GC goals.
 519
 520   We want to build algorithm, which will keep routing cache
 521   at some equilibrium point, when number of aged off entries
 522   is kept approximately equal to newly generated ones.
 523
 524   Current expiration strength is variable "expire".
 525   We try to adjust it dynamically, so that if networking
 526   is idle expires is large enough to keep enough of warm entries,
 527   and when load increases it reduces to limit cache size.
 528 */
 529
 530static int rt_garbage_collect(void)
 531{
 532        static unsigned long expire = RT_GC_TIMEOUT;
 533        static unsigned long last_gc;
 534        static int rover;
 535        static int equilibrium;
 536        struct rtable *rth, **rthp;
 537        unsigned long now = jiffies;
 538        int goal;
 539
 540        /*
 541         * Garbage collection is pretty expensive,
 542         * do not make it too frequently.
 543         */
 544
 545        rt_cache_stat[smp_processor_id()].gc_total++;
 546
 547        if (now - last_gc < ip_rt_gc_min_interval &&
 548            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 549                rt_cache_stat[smp_processor_id()].gc_ignored++;
 550                goto out;
 551        }
 552
 553        /* Calculate number of entries, which we want to expire now. */
 554        goal = atomic_read(&ipv4_dst_ops.entries) -
 555                (ip_rt_gc_elasticity << rt_hash_log);
 556        if (goal <= 0) {
 557                if (equilibrium < ipv4_dst_ops.gc_thresh)
 558                        equilibrium = ipv4_dst_ops.gc_thresh;
 559                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 560                if (goal > 0) {
 561                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 562                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 563                }
 564        } else {
 565                /* We are in dangerous area. Try to reduce cache really
 566                 * aggressively.
 567                 */
 568                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 569                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 570        }
 571
 572        if (now - last_gc >= ip_rt_gc_min_interval)
 573                last_gc = now;
 574
 575        if (goal <= 0) {
 576                equilibrium += goal;
 577                goto work_done;
 578        }
 579
 580        do {
 581                int i, k;
 582
 583                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 584                        unsigned long tmo = expire;
 585
 586                        k = (k + 1) & rt_hash_mask;
 587                        rthp = &rt_hash_table[k].chain;
 588                        write_lock_bh(&rt_hash_table[k].lock);
 589                        while ((rth = *rthp) != NULL) {
 590                                if (!rt_may_expire(rth, tmo, expire)) {
 591                                        tmo >>= 1;
 592                                        rthp = &rth->u.rt_next;
 593                                        continue;
 594                                }
 595                                *rthp = rth->u.rt_next;
 596                                rt_free(rth);
 597                                goal--;
 598                        }
 599                        write_unlock_bh(&rt_hash_table[k].lock);
 600                        if (goal <= 0)
 601                                break;
 602                }
 603                rover = k;
 604
 605                if (goal <= 0)
 606                        goto work_done;
 607
 608                /* Goal is not achieved. We stop process if:
 609
 610                   - if expire reduced to zero. Otherwise, expire is halfed.
 611                   - if table is not full.
 612                   - if we are called from interrupt.
 613                   - jiffies check is just fallback/debug loop breaker.
 614                     We will not spin here for long time in any case.
 615                 */
 616
 617                rt_cache_stat[smp_processor_id()].gc_goal_miss++;
 618
 619                if (expire == 0)
 620                        break;
 621
 622                expire >>= 1;
 623#if RT_CACHE_DEBUG >= 2
 624                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 625                                atomic_read(&ipv4_dst_ops.entries), goal, i);
 626#endif
 627
 628                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 629                        goto out;
 630        } while (!in_softirq() && time_before_eq(jiffies, now));
 631
 632        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 633                goto out;
 634        if (net_ratelimit())
 635                printk(KERN_WARNING "dst cache overflow\n");
 636        rt_cache_stat[smp_processor_id()].gc_dst_overflow++;
 637        return 1;
 638
 639work_done:
 640        expire += ip_rt_gc_min_interval;
 641        if (expire > ip_rt_gc_timeout ||
 642            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 643                expire = ip_rt_gc_timeout;
 644#if RT_CACHE_DEBUG >= 2
 645        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 646                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
 647#endif
 648out:    return 0;
 649}
 650
 651static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 652{
 653        struct rtable   *rth, **rthp;
 654        unsigned long   now;
 655        struct rtable *cand, **candp;
 656        u32             min_score;
 657        int             chain_length;
 658        int attempts = !in_softirq();
 659
 660restart:
 661        chain_length = 0;
 662        min_score = ~(u32)0;
 663        cand = NULL;
 664        candp = NULL;
 665        now = jiffies;
 666
 667        rthp = &rt_hash_table[hash].chain;
 668
 669        write_lock_bh(&rt_hash_table[hash].lock);
 670        while ((rth = *rthp) != NULL) {
 671                if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
 672                        /* Put it first */
 673                        *rthp = rth->u.rt_next;
 674                        rth->u.rt_next = rt_hash_table[hash].chain;
 675                        rt_hash_table[hash].chain = rth;
 676
 677                        rth->u.dst.__use++;
 678                        dst_hold(&rth->u.dst);
 679                        rth->u.dst.lastuse = now;
 680                        write_unlock_bh(&rt_hash_table[hash].lock);
 681
 682                        rt_drop(rt);
 683                        *rp = rth;
 684                        return 0;
 685                }
 686
 687                if (!atomic_read(&rth->u.dst.__refcnt)) {
 688                        u32 score = rt_score(rth);
 689
 690                        if (score <= min_score) {
 691                                cand = rth;
 692                                candp = rthp;
 693                                min_score = score;
 694                        }
 695                }
 696
 697                chain_length++;
 698
 699                rthp = &rth->u.rt_next;
 700        }
 701
 702        if (cand) {
 703                /* ip_rt_gc_elasticity used to be average length of chain
 704                 * length, when exceeded gc becomes really aggressive.
 705                 *
 706                 * The second limit is less certain. At the moment it allows
 707                 * only 2 entries per bucket. We will see.
 708                 */
 709                if (chain_length > ip_rt_gc_elasticity) {
 710                        *candp = cand->u.rt_next;
 711                        rt_free(cand);
 712                }
 713        }
 714
 715        /* Try to bind route to arp only if it is output
 716           route or unicast forwarding path.
 717         */
 718        if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
 719                int err = arp_bind_neighbour(&rt->u.dst);
 720                if (err) {
 721                        write_unlock_bh(&rt_hash_table[hash].lock);
 722
 723                        if (err != -ENOBUFS) {
 724                                rt_drop(rt);
 725                                return err;
 726                        }
 727
 728                        /* Neighbour tables are full and nothing
 729                           can be released. Try to shrink route cache,
 730                           it is most likely it holds some neighbour records.
 731                         */
 732                        if (attempts-- > 0) {
 733                                int saved_elasticity = ip_rt_gc_elasticity;
 734                                int saved_int = ip_rt_gc_min_interval;
 735                                ip_rt_gc_elasticity     = 1;
 736                                ip_rt_gc_min_interval   = 0;
 737                                rt_garbage_collect();
 738                                ip_rt_gc_min_interval   = saved_int;
 739                                ip_rt_gc_elasticity     = saved_elasticity;
 740                                goto restart;
 741                        }
 742
 743                        if (net_ratelimit())
 744                                printk(KERN_WARNING "Neighbour table overflow.\n");
 745                        rt_drop(rt);
 746                        return -ENOBUFS;
 747                }
 748        }
 749
 750        rt->u.rt_next = rt_hash_table[hash].chain;
 751#if RT_CACHE_DEBUG >= 2
 752        if (rt->u.rt_next) {
 753                struct rtable *trt;
 754                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
 755                       NIPQUAD(rt->rt_dst));
 756                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
 757                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
 758                printk("\n");
 759        }
 760#endif
 761        rt_hash_table[hash].chain = rt;
 762        write_unlock_bh(&rt_hash_table[hash].lock);
 763        *rp = rt;
 764        return 0;
 765}
 766
 767void rt_bind_peer(struct rtable *rt, int create)
 768{
 769        static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
 770        struct inet_peer *peer;
 771
 772        peer = inet_getpeer(rt->rt_dst, create);
 773
 774        spin_lock_bh(&rt_peer_lock);
 775        if (rt->peer == NULL) {
 776                rt->peer = peer;
 777                peer = NULL;
 778        }
 779        spin_unlock_bh(&rt_peer_lock);
 780        if (peer)
 781                inet_putpeer(peer);
 782}
 783
 784/*
 785 * Peer allocation may fail only in serious out-of-memory conditions.  However
 786 * we still can generate some output.
 787 * Random ID selection looks a bit dangerous because we have no chances to
 788 * select ID being unique in a reasonable period of time.
 789 * But broken packet identifier may be better than no packet at all.
 790 */
 791static void ip_select_fb_ident(struct iphdr *iph)
 792{
 793        static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
 794        static u32 ip_fallback_id;
 795        u32 salt;
 796
 797        spin_lock_bh(&ip_fb_id_lock);
 798        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
 799        iph->id = htons(salt & 0xFFFF);
 800        ip_fallback_id = salt;
 801        spin_unlock_bh(&ip_fb_id_lock);
 802}
 803
 804void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
 805{
 806        struct rtable *rt = (struct rtable *) dst;
 807
 808        if (rt) {
 809                if (rt->peer == NULL)
 810                        rt_bind_peer(rt, 1);
 811
 812                /* If peer is attached to destination, it is never detached,
 813                   so that we need not to grab a lock to dereference it.
 814                 */
 815                if (rt->peer) {
 816                        iph->id = htons(inet_getid(rt->peer));
 817                        return;
 818                }
 819        } else
 820                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
 821
 822        ip_select_fb_ident(iph);
 823}
 824
 825static void rt_del(unsigned hash, struct rtable *rt)
 826{
 827        struct rtable **rthp;
 828
 829        write_lock_bh(&rt_hash_table[hash].lock);
 830        ip_rt_put(rt);
 831        for (rthp = &rt_hash_table[hash].chain; *rthp;
 832             rthp = &(*rthp)->u.rt_next)
 833                if (*rthp == rt) {
 834                        *rthp = rt->u.rt_next;
 835                        rt_free(rt);
 836                        break;
 837                }
 838        write_unlock_bh(&rt_hash_table[hash].lock);
 839}
 840
 841void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
 842                    u32 saddr, u8 tos, struct net_device *dev)
 843{
 844        int i, k;
 845        struct in_device *in_dev = in_dev_get(dev);
 846        struct rtable *rth, **rthp;
 847        u32  skeys[2] = { saddr, 0 };
 848        int  ikeys[2] = { dev->ifindex, 0 };
 849
 850        tos &= IPTOS_RT_MASK;
 851
 852        if (!in_dev)
 853                return;
 854
 855        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
 856            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
 857                goto reject_redirect;
 858
 859        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 860                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 861                        goto reject_redirect;
 862                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 863                        goto reject_redirect;
 864        } else {
 865                if (inet_addr_type(new_gw) != RTN_UNICAST)
 866                        goto reject_redirect;
 867        }
 868
 869        for (i = 0; i < 2; i++) {
 870                for (k = 0; k < 2; k++) {
 871                        unsigned hash = rt_hash_code(daddr,
 872                                                     skeys[i] ^ (ikeys[k] << 5),
 873                                                     tos);
 874
 875                        rthp=&rt_hash_table[hash].chain;
 876
 877                        read_lock(&rt_hash_table[hash].lock);
 878                        while ((rth = *rthp) != NULL) {
 879                                struct rtable *rt;
 880
 881                                if (rth->key.dst != daddr ||
 882                                    rth->key.src != skeys[i] ||
 883                                    rth->key.tos != tos ||
 884                                    rth->key.oif != ikeys[k] ||
 885                                    rth->key.iif != 0) {
 886                                        rthp = &rth->u.rt_next;
 887                                        continue;
 888                                }
 889
 890                                if (rth->rt_dst != daddr ||
 891                                    rth->rt_src != saddr ||
 892                                    rth->u.dst.error ||
 893                                    rth->rt_gateway != old_gw ||
 894                                    rth->u.dst.dev != dev)
 895                                        break;
 896
 897                                dst_hold(&rth->u.dst);
 898                                read_unlock(&rt_hash_table[hash].lock);
 899
 900                                rt = dst_alloc(&ipv4_dst_ops);
 901                                if (rt == NULL) {
 902                                        ip_rt_put(rth);
 903                                        in_dev_put(in_dev);
 904                                        return;
 905                                }
 906
 907                                /* Copy all the information. */
 908                                *rt = *rth;
 909                                rt->u.dst.__use         = 1;
 910                                atomic_set(&rt->u.dst.__refcnt, 1);
 911                                if (rt->u.dst.dev)
 912                                        dev_hold(rt->u.dst.dev);
 913                                rt->u.dst.lastuse       = jiffies;
 914                                rt->u.dst.neighbour     = NULL;
 915                                rt->u.dst.hh            = NULL;
 916                                rt->u.dst.obsolete      = 0;
 917
 918                                rt->rt_flags            |= RTCF_REDIRECTED;
 919
 920                                /* Gateway is different ... */
 921                                rt->rt_gateway          = new_gw;
 922
 923                                /* Redirect received -> path was valid */
 924                                dst_confirm(&rth->u.dst);
 925
 926                                if (rt->peer)
 927                                        atomic_inc(&rt->peer->refcnt);
 928
 929                                if (arp_bind_neighbour(&rt->u.dst) ||
 930                                    !(rt->u.dst.neighbour->nud_state &
 931                                            NUD_VALID)) {
 932                                        if (rt->u.dst.neighbour)
 933                                                neigh_event_send(rt->u.dst.neighbour, NULL);
 934                                        ip_rt_put(rth);
 935                                        rt_drop(rt);
 936                                        goto do_next;
 937                                }
 938
 939                                rt_del(hash, rth);
 940                                if (!rt_intern_hash(hash, rt, &rt))
 941                                        ip_rt_put(rt);
 942                                goto do_next;
 943                        }
 944                        read_unlock(&rt_hash_table[hash].lock);
 945                do_next:
 946                        ;
 947                }
 948        }
 949        in_dev_put(in_dev);
 950        return;
 951
 952reject_redirect:
 953#ifdef CONFIG_IP_ROUTE_VERBOSE
 954        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
 955                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
 956                        "%u.%u.%u.%u ignored.\n"
 957                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
 958                        "tos %02x\n",
 959                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
 960                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
 961#endif
 962        in_dev_put(in_dev);
 963}
 964
 965static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 966{
 967        struct rtable *rt = (struct rtable*)dst;
 968        struct dst_entry *ret = dst;
 969
 970        if (rt) {
 971                if (dst->obsolete) {
 972                        ip_rt_put(rt);
 973                        ret = NULL;
 974                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 975                           rt->u.dst.expires) {
 976                        unsigned hash = rt_hash_code(rt->key.dst,
 977                                                     rt->key.src ^
 978                                                        (rt->key.oif << 5),
 979                                                     rt->key.tos);
 980#if RT_CACHE_DEBUG >= 1
 981                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
 982                                          "%u.%u.%u.%u/%02x dropped\n",
 983                                NIPQUAD(rt->rt_dst), rt->key.tos);
 984#endif
 985                        rt_del(hash, rt);
 986                        ret = NULL;
 987                }
 988        }
 989        return ret;
 990}
 991
 992/*
 993 * Algorithm:
 994 *      1. The first ip_rt_redirect_number redirects are sent
 995 *         with exponential backoff, then we stop sending them at all,
 996 *         assuming that the host ignores our redirects.
 997 *      2. If we did not see packets requiring redirects
 998 *         during ip_rt_redirect_silence, we assume that the host
 999 *         forgot redirected route and start to send redirects again.
1000 *
1001 * This algorithm is much cheaper and more intelligent than dumb load limiting
1002 * in icmp.c.
1003 *
1004 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1005 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1006 */
1007
1008void ip_rt_send_redirect(struct sk_buff *skb)
1009{
1010        struct rtable *rt = (struct rtable*)skb->dst;
1011        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1012
1013        if (!in_dev)
1014                return;
1015
1016        if (!IN_DEV_TX_REDIRECTS(in_dev))
1017                goto out;
1018
1019        /* No redirected packets during ip_rt_redirect_silence;
1020         * reset the algorithm.
1021         */
1022        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1023                rt->u.dst.rate_tokens = 0;
1024
1025        /* Too many ignored redirects; do not send anything
1026         * set u.dst.rate_last to the last seen redirected packet.
1027         */
1028        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1029                rt->u.dst.rate_last = jiffies;
1030                goto out;
1031        }
1032
1033        /* Check for load limit; set rate_last to the latest sent
1034         * redirect.
1035         */
1036        if (time_after(jiffies,
1037                       (rt->u.dst.rate_last +
1038                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1039                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1040                rt->u.dst.rate_last = jiffies;
1041                ++rt->u.dst.rate_tokens;
1042#ifdef CONFIG_IP_ROUTE_VERBOSE
1043                if (IN_DEV_LOG_MARTIANS(in_dev) &&
1044                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1045                    net_ratelimit())
1046                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1047                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1048                                NIPQUAD(rt->rt_src), rt->rt_iif,
1049                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1050#endif
1051        }
1052out:
1053        in_dev_put(in_dev);
1054}
1055
1056static int ip_error(struct sk_buff *skb)
1057{
1058        struct rtable *rt = (struct rtable*)skb->dst;
1059        unsigned long now;
1060        int code;
1061
1062        switch (rt->u.dst.error) {
1063                case EINVAL:
1064                default:
1065                        goto out;
1066                case EHOSTUNREACH:
1067                        code = ICMP_HOST_UNREACH;
1068                        break;
1069                case ENETUNREACH:
1070                        code = ICMP_NET_UNREACH;
1071                        break;
1072                case EACCES:
1073                        code = ICMP_PKT_FILTERED;
1074                        break;
1075        }
1076
1077        now = jiffies;
1078        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1079        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1080                rt->u.dst.rate_tokens = ip_rt_error_burst;
1081        rt->u.dst.rate_last = now;
1082        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1083                rt->u.dst.rate_tokens -= ip_rt_error_cost;
1084                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1085        }
1086
1087out:    kfree_skb(skb);
1088        return 0;
1089} 
1090
/*
 *      MTU plateau table, in descending order.
 *
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below old_mtu,
 * or 68 when old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        unsigned int i;

        for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
1108
1109unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1110{
1111        int i;
1112        unsigned short old_mtu = ntohs(iph->tot_len);
1113        struct rtable *rth;
1114        u32  skeys[2] = { iph->saddr, 0, };
1115        u32  daddr = iph->daddr;
1116        u8   tos = iph->tos & IPTOS_RT_MASK;
1117        unsigned short est_mtu = 0;
1118
1119        if (ipv4_config.no_pmtu_disc)
1120                return 0;
1121
1122        for (i = 0; i < 2; i++) {
1123                unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1124
1125                read_lock(&rt_hash_table[hash].lock);
1126                for (rth = rt_hash_table[hash].chain; rth;
1127                     rth = rth->u.rt_next) {
1128                        if (rth->key.dst == daddr &&
1129                            rth->key.src == skeys[i] &&
1130                            rth->rt_dst  == daddr &&
1131                            rth->rt_src  == iph->saddr &&
1132                            rth->key.tos == tos &&
1133                            rth->key.iif == 0 &&
1134                            !(rth->u.dst.mxlock & (1 << RTAX_MTU))) {
1135                                unsigned short mtu = new_mtu;
1136
1137                                if (new_mtu < 68 || new_mtu >= old_mtu) {
1138
1139                                        /* BSD 4.2 compatibility hack :-( */
1140                                        if (mtu == 0 &&
1141                                            old_mtu >= rth->u.dst.pmtu &&
1142                                            old_mtu >= 68 + (iph->ihl << 2))
1143                                                old_mtu -= iph->ihl << 2;
1144
1145                                        mtu = guess_mtu(old_mtu);
1146                                }
1147                                if (mtu <= rth->u.dst.pmtu) {
1148                                        if (mtu < rth->u.dst.pmtu) { 
1149                                                dst_confirm(&rth->u.dst);
1150                                                if (mtu < ip_rt_min_pmtu) {
1151                                                        mtu = ip_rt_min_pmtu;
1152                                                        rth->u.dst.mxlock |=
1153                                                                (1 << RTAX_MTU);
1154                                                }
1155                                                rth->u.dst.pmtu = mtu;
1156                                                dst_set_expires(&rth->u.dst,
1157                                                        ip_rt_mtu_expires);
1158                                        }
1159                                        est_mtu = mtu;
1160                                }
1161                        }
1162                }
1163                read_unlock(&rt_hash_table[hash].lock);
1164        }
1165        return est_mtu ? : new_mtu;
1166}
1167
1168void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
1169{
1170        if (dst->pmtu > mtu && mtu >= 68 &&
1171            !(dst->mxlock & (1 << RTAX_MTU))) {
1172                if (mtu < ip_rt_min_pmtu) {
1173                        mtu = ip_rt_min_pmtu;
1174                        dst->mxlock |= (1 << RTAX_MTU);
1175                }
1176                dst->pmtu = mtu;
1177                dst_set_expires(dst, ip_rt_mtu_expires);
1178        }
1179}
1180
1181static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1182{
1183        dst_release(dst);
1184        return NULL;
1185}
1186
1187static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
1188                                          struct sk_buff *skb)
1189{
1190        return NULL;
1191}
1192
1193static void ipv4_dst_destroy(struct dst_entry *dst)
1194{
1195        struct rtable *rt = (struct rtable *) dst;
1196        struct inet_peer *peer = rt->peer;
1197
1198        if (peer) {
1199                rt->peer = NULL;
1200                inet_putpeer(peer);
1201        }
1202}
1203
1204static void ipv4_link_failure(struct sk_buff *skb)
1205{
1206        struct rtable *rt;
1207
1208        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1209
1210        rt = (struct rtable *) skb->dst;
1211        if (rt)
1212                dst_set_expires(&rt->u.dst, 0);
1213}
1214
1215static int ip_rt_bug(struct sk_buff *skb)
1216{
1217        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1218                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1219                skb->dev ? skb->dev->name : "?");
1220        kfree_skb(skb);
1221        return 0;
1222}
1223
1224/*
1225   We do not cache source address of outgoing interface,
1226   because it is used only by IP RR, TS and SRR options,
1227   so that it out of fast path.
1228
1229   BTW remember: "addr" is allowed to be not aligned
1230   in IP options!
1231 */
1232
1233void ip_rt_get_source(u8 *addr, struct rtable *rt)
1234{
1235        u32 src;
1236        struct fib_result res;
1237
1238        if (rt->key.iif == 0)
1239                src = rt->rt_src;
1240        else if (fib_lookup(&rt->key, &res) == 0) {
1241#ifdef CONFIG_IP_ROUTE_NAT
1242                if (res.type == RTN_NAT)
1243                        src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1244                                                RT_SCOPE_UNIVERSE);
1245                else
1246#endif
1247                        src = FIB_RES_PREFSRC(res);
1248                fib_res_put(&res);
1249        } else
1250                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1251                                        RT_SCOPE_UNIVERSE);
1252        memcpy(addr, &src, 4);
1253}
1254
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge tag into the route's tclassid.  The low and the high 16-bit halves
 * are handled separately: a half is taken from tag only if that half of
 * tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        u32 id = rt->u.dst.tclassid;

        if ((id & 0xFFFF) == 0)
                id |= tag & 0xFFFF;
        if ((id & 0xFFFF0000) == 0)
                id |= tag & 0xFFFF0000;

        rt->u.dst.tclassid = id;
}
#endif
1264
1265static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1266{
1267        struct fib_info *fi = res->fi;
1268
1269        if (fi) {
1270                if (FIB_RES_GW(*res) &&
1271                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1272                        rt->rt_gateway = FIB_RES_GW(*res);
1273                memcpy(&rt->u.dst.mxlock, fi->fib_metrics,
1274                        sizeof(fi->fib_metrics));
1275                if (fi->fib_mtu == 0) {
1276                        rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1277                        if (rt->u.dst.mxlock & (1 << RTAX_MTU) &&
1278                            rt->rt_gateway != rt->rt_dst &&
1279                            rt->u.dst.pmtu > 576)
1280                                rt->u.dst.pmtu = 576;
1281                }
1282#ifdef CONFIG_NET_CLS_ROUTE
1283                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1284#endif
1285        } else
1286                rt->u.dst.pmtu  = rt->u.dst.dev->mtu;
1287
1288        if (rt->u.dst.pmtu > IP_MAX_MTU)
1289                rt->u.dst.pmtu = IP_MAX_MTU;
1290        if (rt->u.dst.advmss == 0)
1291                rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1292                                       ip_rt_min_advmss);
1293        if (rt->u.dst.advmss > 65535 - 40)
1294                rt->u.dst.advmss = 65535 - 40;
1295
1296#ifdef CONFIG_NET_CLS_ROUTE
1297#ifdef CONFIG_IP_MULTIPLE_TABLES
1298        set_class_tag(rt, fib_rules_tclass(res));
1299#endif
1300        set_class_tag(rt, itag);
1301#endif
1302        rt->rt_type = res->type;
1303}
1304
1305static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1306                                u8 tos, struct net_device *dev, int our)
1307{
1308        unsigned hash;
1309        struct rtable *rth;
1310        u32 spec_dst;
1311        struct in_device *in_dev = in_dev_get(dev);
1312        u32 itag = 0;
1313
1314        /* Primary sanity checks. */
1315
1316        if (in_dev == NULL)
1317                return -EINVAL;
1318
1319        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1320            skb->protocol != htons(ETH_P_IP))
1321                goto e_inval;
1322
1323        if (ZERONET(saddr)) {
1324                if (!LOCAL_MCAST(daddr))
1325                        goto e_inval;
1326                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1327        } else if (fib_validate_source(saddr, 0, tos, 0,
1328                                        dev, &spec_dst, &itag) < 0)
1329                goto e_inval;
1330
1331        rth = dst_alloc(&ipv4_dst_ops);
1332        if (!rth)
1333                goto e_nobufs;
1334
1335        rth->u.dst.output= ip_rt_bug;
1336
1337        atomic_set(&rth->u.dst.__refcnt, 1);
1338        rth->u.dst.flags= DST_HOST;
1339        rth->key.dst    = daddr;
1340        rth->rt_dst     = daddr;
1341        rth->key.tos    = tos;
1342#ifdef CONFIG_IP_ROUTE_FWMARK
1343        rth->key.fwmark = skb->nfmark;
1344#endif
1345        rth->key.src    = saddr;
1346        rth->rt_src     = saddr;
1347#ifdef CONFIG_IP_ROUTE_NAT
1348        rth->rt_dst_map = daddr;
1349        rth->rt_src_map = saddr;
1350#endif
1351#ifdef CONFIG_NET_CLS_ROUTE
1352        rth->u.dst.tclassid = itag;
1353#endif
1354        rth->rt_iif     =
1355        rth->key.iif    = dev->ifindex;
1356        rth->u.dst.dev  = &loopback_dev;
1357        dev_hold(rth->u.dst.dev);
1358        rth->key.oif    = 0;
1359        rth->rt_gateway = daddr;
1360        rth->rt_spec_dst= spec_dst;
1361        rth->rt_type    = RTN_MULTICAST;
1362        rth->rt_flags   = RTCF_MULTICAST;
1363        if (our) {
1364                rth->u.dst.input= ip_local_deliver;
1365                rth->rt_flags |= RTCF_LOCAL;
1366        }
1367
1368#ifdef CONFIG_IP_MROUTE
1369        if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1370                rth->u.dst.input = ip_mr_input;
1371#endif
1372        rt_cache_stat[smp_processor_id()].in_slow_mc++;
1373
1374        in_dev_put(in_dev);
1375        hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1376        return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1377
1378e_nobufs:
1379        in_dev_put(in_dev);
1380        return -ENOBUFS;
1381
1382e_inval:
1383        in_dev_put(in_dev);
1384        return -EINVAL;
1385}
1386
/*
 *      NOTE. We drop all packets that have local source
 *      addresses, because every properly looped-back packet
 *      must already have the correct destination attached by the
 *      output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1396
1397int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1398                        u8 tos, struct net_device *dev)
1399{
1400        struct rt_key   key;
1401        struct fib_result res;
1402        struct in_device *in_dev = in_dev_get(dev);
1403        struct in_device *out_dev = NULL;
1404        unsigned        flags = 0;
1405        u32             itag = 0;
1406        struct rtable * rth;
1407        unsigned        hash;
1408        u32             spec_dst;
1409        int             err = -EINVAL;
1410        int             free_res = 0;
1411
1412        /* IP on this device is disabled. */
1413
1414        if (!in_dev)
1415                goto out;
1416
1417        key.dst         = daddr;
1418        key.src         = saddr;
1419        key.tos         = tos;
1420#ifdef CONFIG_IP_ROUTE_FWMARK
1421        key.fwmark      = skb->nfmark;
1422#endif
1423        key.iif         = dev->ifindex;
1424        key.oif         = 0;
1425        key.scope       = RT_SCOPE_UNIVERSE;
1426
1427        hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos);
1428
1429        /* Check for the most weird martians, which can be not detected
1430           by fib_lookup.
1431         */
1432
1433        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1434                goto martian_source;
1435
1436        if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1437                goto brd_input;
1438
1439        /* Accept zero addresses only to limited broadcast;
1440         * I even do not know to fix it or not. Waiting for complains :-)
1441         */
1442        if (ZERONET(saddr))
1443                goto martian_source;
1444
1445        if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1446                goto martian_destination;
1447
1448        /*
1449         *      Now we are ready to route packet.
1450         */
1451        if ((err = fib_lookup(&key, &res)) != 0) {
1452                if (!IN_DEV_FORWARD(in_dev))
1453                        goto e_inval;
1454                goto no_route;
1455        }
1456        free_res = 1;
1457
1458        rt_cache_stat[smp_processor_id()].in_slow_tot++;
1459
1460#ifdef CONFIG_IP_ROUTE_NAT
1461        /* Policy is applied before mapping destination,
1462           but rerouting after map should be made with old source.
1463         */
1464
1465        if (1) {
1466                u32 src_map = saddr;
1467                if (res.r)
1468                        src_map = fib_rules_policy(saddr, &res, &flags);
1469
1470                if (res.type == RTN_NAT) {
1471                        key.dst = fib_rules_map_destination(daddr, &res);
1472                        fib_res_put(&res);
1473                        free_res = 0;
1474                        if (fib_lookup(&key, &res))
1475                                goto e_inval;
1476                        free_res = 1;
1477                        if (res.type != RTN_UNICAST)
1478                                goto e_inval;
1479                        flags |= RTCF_DNAT;
1480                }
1481                key.src = src_map;
1482        }
1483#endif
1484
1485        if (res.type == RTN_BROADCAST)
1486                goto brd_input;
1487
1488        if (res.type == RTN_LOCAL) {
1489                int result;
1490                result = fib_validate_source(saddr, daddr, tos,
1491                                             loopback_dev.ifindex,
1492                                             dev, &spec_dst, &itag);
1493                if (result < 0)
1494                        goto martian_source;
1495                if (result)
1496                        flags |= RTCF_DIRECTSRC;
1497                spec_dst = daddr;
1498                goto local_input;
1499        }
1500
1501        if (!IN_DEV_FORWARD(in_dev))
1502                goto e_inval;
1503        if (res.type != RTN_UNICAST)
1504                goto martian_destination;
1505
1506#ifdef CONFIG_IP_ROUTE_MULTIPATH
1507        if (res.fi->fib_nhs > 1 && key.oif == 0)
1508                fib_select_multipath(&key, &res);
1509#endif
1510        out_dev = in_dev_get(FIB_RES_DEV(res));
1511        if (out_dev == NULL) {
1512                if (net_ratelimit())
1513                        printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1514                                         "Please, report\n");
1515                goto e_inval;
1516        }
1517
1518        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1519                                  &spec_dst, &itag);
1520        if (err < 0)
1521                goto martian_source;
1522
1523        if (err)
1524                flags |= RTCF_DIRECTSRC;
1525
1526        if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1527            (IN_DEV_SHARED_MEDIA(out_dev) ||
1528             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1529                flags |= RTCF_DOREDIRECT;
1530
1531        if (skb->protocol != htons(ETH_P_IP)) {
1532                /* Not IP (i.e. ARP). Do not create route, if it is
1533                 * invalid for proxy arp. DNAT routes are always valid.
1534                 */
1535                if (out_dev == in_dev && !(flags & RTCF_DNAT))
1536                        goto e_inval;
1537        }
1538
1539        rth = dst_alloc(&ipv4_dst_ops);
1540        if (!rth)
1541                goto e_nobufs;
1542
1543        atomic_set(&rth->u.dst.__refcnt, 1);
1544        rth->u.dst.flags= DST_HOST;
1545        rth->key.dst    = daddr;
1546        rth->rt_dst     = daddr;
1547        rth->key.tos    = tos;
1548#ifdef CONFIG_IP_ROUTE_FWMARK
1549        rth->key.fwmark = skb->nfmark;
1550#endif
1551        rth->key.src    = saddr;
1552        rth->rt_src     = saddr;
1553        rth->rt_gateway = daddr;
1554#ifdef CONFIG_IP_ROUTE_NAT
1555        rth->rt_src_map = key.src;
1556        rth->rt_dst_map = key.dst;
1557        if (flags&RTCF_DNAT)
1558                rth->rt_gateway = key.dst;
1559#endif
1560        rth->rt_iif     =
1561        rth->key.iif    = dev->ifindex;
1562        rth->u.dst.dev  = out_dev->dev;
1563        dev_hold(rth->u.dst.dev);
1564        rth->key.oif    = 0;
1565        rth->rt_spec_dst= spec_dst;
1566
1567        rth->u.dst.input = ip_forward;
1568        rth->u.dst.output = ip_output;
1569
1570        rt_set_nexthop(rth, &res, itag);
1571
1572        rth->rt_flags = flags;
1573
1574#ifdef CONFIG_NET_FASTROUTE
1575        if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1576                struct net_device *odev = rth->u.dst.dev;
1577                if (odev != dev &&
1578                    dev->accept_fastpath &&
1579                    odev->mtu >= dev->mtu &&
1580                    dev->accept_fastpath(dev, &rth->u.dst) == 0)
1581                        rth->rt_flags |= RTCF_FAST;
1582        }
1583#endif
1584
1585intern:
1586        err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1587done:
1588        in_dev_put(in_dev);
1589        if (out_dev)
1590                in_dev_put(out_dev);
1591        if (free_res)
1592                fib_res_put(&res);
1593out:    return err;
1594
1595brd_input:
1596        if (skb->protocol != htons(ETH_P_IP))
1597                goto e_inval;
1598
1599        if (ZERONET(saddr))
1600                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1601        else {
1602                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1603                                          &itag);
1604                if (err < 0)
1605                        goto martian_source;
1606                if (err)
1607                        flags |= RTCF_DIRECTSRC;
1608        }
1609        flags |= RTCF_BROADCAST;
1610        res.type = RTN_BROADCAST;
1611        rt_cache_stat[smp_processor_id()].in_brd++;
1612
1613local_input:
1614        rth = dst_alloc(&ipv4_dst_ops);
1615        if (!rth)
1616                goto e_nobufs;
1617
1618        rth->u.dst.output= ip_rt_bug;
1619
1620        atomic_set(&rth->u.dst.__refcnt, 1);
1621        rth->u.dst.flags= DST_HOST;
1622        rth->key.dst    = daddr;
1623        rth->rt_dst     = daddr;
1624        rth->key.tos    = tos;
1625#ifdef CONFIG_IP_ROUTE_FWMARK
1626        rth->key.fwmark = skb->nfmark;
1627#endif
1628        rth->key.src    = saddr;
1629        rth->rt_src     = saddr;
1630#ifdef CONFIG_IP_ROUTE_NAT
1631        rth->rt_dst_map = key.dst;
1632        rth->rt_src_map = key.src;
1633#endif
1634#ifdef CONFIG_NET_CLS_ROUTE
1635        rth->u.dst.tclassid = itag;
1636#endif
1637        rth->rt_iif     =
1638        rth->key.iif    = dev->ifindex;
1639        rth->u.dst.dev  = &loopback_dev;
1640        dev_hold(rth->u.dst.dev);
1641        rth->key.oif    = 0;
1642        rth->rt_gateway = daddr;
1643        rth->rt_spec_dst= spec_dst;
1644        rth->u.dst.input= ip_local_deliver;
1645        rth->rt_flags   = flags|RTCF_LOCAL;
1646        if (res.type == RTN_UNREACHABLE) {
1647                rth->u.dst.input= ip_error;
1648                rth->u.dst.error= -err;
1649                rth->rt_flags   &= ~RTCF_LOCAL;
1650        }
1651        rth->rt_type    = res.type;
1652        goto intern;
1653
1654no_route:
1655        rt_cache_stat[smp_processor_id()].in_no_route++;
1656        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1657        res.type = RTN_UNREACHABLE;
1658        goto local_input;
1659
1660        /*
1661         *      Do not cache martian addresses: they should be logged (RFC1812)
1662         */
1663martian_destination:
1664        rt_cache_stat[smp_processor_id()].in_martian_dst++;
1665#ifdef CONFIG_IP_ROUTE_VERBOSE
1666        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1667                printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1668                        "%u.%u.%u.%u, dev %s\n",
1669                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1670#endif
1671e_inval:
1672        err = -EINVAL;
1673        goto done;
1674
1675e_nobufs:
1676        err = -ENOBUFS;
1677        goto done;
1678
1679martian_source:
1680
1681        rt_cache_stat[smp_processor_id()].in_martian_src++;
1682#ifdef CONFIG_IP_ROUTE_VERBOSE
1683        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1684                /*
1685                 *      RFC1812 recommendation, if source is martian,
1686                 *      the only hint is MAC header.
1687                 */
1688                printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1689                        "%u.%u.%u.%u, on dev %s\n",
1690                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1691                if (dev->hard_header_len) {
1692                        int i;
1693                        unsigned char *p = skb->mac.raw;
1694                        printk(KERN_WARNING "ll header: ");
1695                        for (i = 0; i < dev->hard_header_len; i++, p++) {
1696                                printk("%02x", *p);
1697                                if (i < (dev->hard_header_len - 1))
1698                                        printk(":");
1699                        }
1700                        printk("\n");
1701                }
1702        }
1703#endif
1704        goto e_inval;
1705}
1706
1707int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1708                   u8 tos, struct net_device *dev)
1709{
1710        struct rtable * rth;
1711        unsigned        hash;
1712        int iif = dev->ifindex;
1713
1714        tos &= IPTOS_RT_MASK;
1715        hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1716
1717        read_lock(&rt_hash_table[hash].lock);
1718        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1719                if (rth->key.dst == daddr &&
1720                    rth->key.src == saddr &&
1721                    rth->key.iif == iif &&
1722                    rth->key.oif == 0 &&
1723#ifdef CONFIG_IP_ROUTE_FWMARK
1724                    rth->key.fwmark == skb->nfmark &&
1725#endif
1726                    rth->key.tos == tos) {
1727                        rth->u.dst.lastuse = jiffies;
1728                        dst_hold(&rth->u.dst);
1729                        rth->u.dst.__use++;
1730                        rt_cache_stat[smp_processor_id()].in_hit++;
1731                        read_unlock(&rt_hash_table[hash].lock);
1732                        skb->dst = (struct dst_entry*)rth;
1733                        return 0;
1734                }
1735                rt_cache_stat[smp_processor_id()].in_hlist_search++;
1736        }
1737        read_unlock(&rt_hash_table[hash].lock);
1738
1739        /* Multicast recognition logic is moved from route cache to here.
1740           The problem was that too many Ethernet cards have broken/missing
1741           hardware multicast filters :-( As result the host on multicasting
1742           network acquires a lot of useless route cache entries, sort of
1743           SDR messages from all the world. Now we try to get rid of them.
1744           Really, provided software IP multicast filter is organized
1745           reasonably (at least, hashed), it does not result in a slowdown
1746           comparing with route cache reject entries.
1747           Note, that multicast routers are not affected, because
1748           route cache entry is created eventually.
1749         */
1750        if (MULTICAST(daddr)) {
1751                struct in_device *in_dev;
1752
1753                read_lock(&inetdev_lock);
1754                if ((in_dev = __in_dev_get(dev)) != NULL) {
1755                        int our = ip_check_mc(in_dev, daddr, saddr);
1756                        if (our
1757#ifdef CONFIG_IP_MROUTE
1758                            || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1759#endif
1760                            ) {
1761                                read_unlock(&inetdev_lock);
1762                                return ip_route_input_mc(skb, daddr, saddr,
1763                                                         tos, dev, our);
1764                        }
1765                }
1766                read_unlock(&inetdev_lock);
1767                return -EINVAL;
1768        }
1769        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1770}
1771
1772/*
1773 * Major route resolver routine.
 *
 * Builds a new output route cache entry for the lookup key @oldkey and
 * hands it to rt_intern_hash() together with @rp.
 *
 * A working copy of the key ("key") is filled in step by step: source
 * address checks, optional output interface handling, the "no
 * destination" loopback case, and finally fib_lookup().  All successful
 * paths converge on the make_route label, where the rtable is allocated
 * and initialised.
 *
 * Returns the result of rt_intern_hash() when an entry was built, or
 * -EINVAL, -ENODEV, -ENETUNREACH or -ENOBUFS from the failure paths.
 *
 * Reference handling visible in this function: a non-NULL dev_out
 * carries a device reference that is released at "done" (or explicitly
 * before the early "goto out" exits); free_res is set once fib_lookup()
 * has succeeded so that "done" calls fib_res_put().
1774 */
1775
1776int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
1777{
1778        struct rt_key key;
1779        struct fib_result res;
1780        unsigned flags = 0;
1781        struct rtable *rth;
1782        struct net_device *dev_out = NULL;
1783        unsigned hash;
1784        int free_res = 0;
1785        int err;
1786        u32 tos;
1787
        /* tos keeps the RTO_ONLINK bit for the cache key; the FIB lookup
           key below only gets the IPTOS_RT_MASK bits, and RTO_ONLINK is
           turned into a link scope instead. */
1788        tos             = oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK);
1789        key.dst         = oldkey->dst;
1790        key.src         = oldkey->src;
1791        key.tos         = tos & IPTOS_RT_MASK;
1792        key.iif         = loopback_dev.ifindex;
1793        key.oif         = oldkey->oif;
1794#ifdef CONFIG_IP_ROUTE_FWMARK
1795        key.fwmark      = oldkey->fwmark;
1796#endif
1797        key.scope       = (tos & RTO_ONLINK) ? RT_SCOPE_LINK :
1798                                                RT_SCOPE_UNIVERSE;
1799        res.fi          = NULL;
1800#ifdef CONFIG_IP_MULTIPLE_TABLES
1801        res.r           = NULL;
1802#endif
1803
        /* Caller supplied a source address: reject multicast, bad-class and
           zero-net sources and require that some device owns the address.
           Both failures leave with -EINVAL. */
1804        if (oldkey->src) {
1805                err = -EINVAL;
1806                if (MULTICAST(oldkey->src) ||
1807                    BADCLASS(oldkey->src) ||
1808                    ZERONET(oldkey->src))
1809                        goto out;
1810
1811                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1812                dev_out = ip_dev_find(oldkey->src);
1813                if (dev_out == NULL)
1814                        goto out;
1815
1816                /* I removed check for oif == dev_out->oif here.
1817                   It was wrong by three reasons:
1818                   1. ip_dev_find(saddr) can return wrong iface, if saddr is
1819                      assigned to multiple interfaces.
1820                   2. Moreover, we are allowed to send packets with saddr
1821                      of another iface. --ANK
1822                 */
1823
1824                if (oldkey->oif == 0
1825                    && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
1826                        /* Special hack: user can direct multicasts
1827                           and limited broadcast via necessary interface
1828                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1829                           This hack is not just for fun, it allows
1830                           vic,vat and friends to work.
1831                           They bind socket to loopback, set ttl to zero
1832                           and expect that it will work.
1833                           From the viewpoint of routing cache they are broken,
1834                           because we are not allowed to build multicast path
1835                           with loopback source addr (look, routing cache
1836                           cannot know, that ttl is zero, so that packet
1837                           will not leave this host and route is valid).
1838                           Luckily, this hack is good workaround.
1839                         */
1840
1841                        key.oif = dev_out->ifindex;
1842                        goto make_route;
1843                }
                /* The device was only needed for the checks above; drop the
                   reference obtained from ip_dev_find(). */
1844                if (dev_out)
1845                        dev_put(dev_out);
1846                dev_out = NULL;
1847        }
        /* Caller pinned an output interface: it must exist (-ENODEV
           otherwise) and __in_dev_get() must return non-NULL for it. */
1848        if (oldkey->oif) {
1849                dev_out = dev_get_by_index(oldkey->oif);
1850                err = -ENODEV;
1851                if (dev_out == NULL)
1852                        goto out;
1853                if (__in_dev_get(dev_out) == NULL) {
1854                        dev_put(dev_out);
1855                        goto out;       /* Wrong error code */
1856                }
1857
                /* Link-local multicast and limited broadcast skip the FIB
                   lookup entirely. */
1858                if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
1859                        if (!key.src)
1860                                key.src = inet_select_addr(dev_out, 0,
1861                                                                RT_SCOPE_LINK);
1862                        goto make_route;
1863                }
1864                if (!key.src) {
1865                        if (MULTICAST(oldkey->dst))
1866                                key.src = inet_select_addr(dev_out, 0,
1867                                                                key.scope);
1868                        else if (!oldkey->dst)
1869                                key.src = inet_select_addr(dev_out, 0,
1870                                                                RT_SCOPE_HOST);
1871                }
1872        }
1873
        /* No destination given: route to the source address (or to
           INADDR_LOOPBACK when there is none either) via loopback_dev. */
1874        if (!key.dst) {
1875                key.dst = key.src;
1876                if (!key.dst)
1877                        key.dst = key.src = htonl(INADDR_LOOPBACK);
1878                if (dev_out)
1879                        dev_put(dev_out);
1880                dev_out = &loopback_dev;
1881                dev_hold(dev_out);
1882                key.oif = loopback_dev.ifindex;
1883                res.type = RTN_LOCAL;
1884                flags |= RTCF_LOCAL;
1885                goto make_route;
1886        }
1887
1888        if (fib_lookup(&key, &res)) {
1889                res.fi = NULL;
1890                if (oldkey->oif) {
1891                        /* Apparently, routing tables are wrong. Assume,
1892                           that the destination is on link.
1893
1894                           WHY? DW.
1895                           Because we are allowed to send to iface
1896                           even if it has NO routes and NO assigned
1897                           addresses. When oif is specified, routing
1898                           tables are looked up with only one purpose:
1899                           to catch if destination is gatewayed, rather than
1900                           direct. Moreover, if MSG_DONTROUTE is set,
1901                           we send packet, ignoring both routing tables
1902                           and ifaddr state. --ANK
1903
1904
1905                           We could make it even if oif is unknown,
1906                           likely IPv6, but we do not.
1907                         */
1908
1909                        if (key.src == 0)
1910                                key.src = inet_select_addr(dev_out, 0,
1911                                                           RT_SCOPE_LINK);
1912                        res.type = RTN_UNICAST;
1913                        goto make_route;
1914                }
                /* Lookup failed and no interface was forced: release the
                   device (if any) here because "out" bypasses "done". */
1915                if (dev_out)
1916                        dev_put(dev_out);
1917                err = -ENETUNREACH;
1918                goto out;
1919        }
        /* fib_lookup() succeeded: "done" must now fib_res_put() the result. */
1920        free_res = 1;
1921
1922        if (res.type == RTN_NAT)
1923                goto e_inval;
1924
        /* Destination is local: the route goes through loopback_dev and
           the FIB info is not kept for the cache entry. */
1925        if (res.type == RTN_LOCAL) {
1926                if (!key.src)
1927                        key.src = key.dst;
1928                if (dev_out)
1929                        dev_put(dev_out);
1930                dev_out = &loopback_dev;
1931                dev_hold(dev_out);
1932                key.oif = dev_out->ifindex;
1933                if (res.fi)
1934                        fib_info_put(res.fi);
1935                res.fi = NULL;
1936                flags |= RTCF_LOCAL;
1937                goto make_route;
1938        }
1939
1940#ifdef CONFIG_IP_ROUTE_MULTIPATH
1941        if (res.fi->fib_nhs > 1 && key.oif == 0)
1942                fib_select_multipath(&key, &res);
1943        else
1944#endif
1945        if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif)
1946                fib_select_default(&key, &res);
1947
1948        if (!key.src)
1949                key.src = FIB_RES_PREFSRC(res);
1950
        /* Switch dev_out to the device chosen by the FIB result, trading
           the old reference (if any) for a new one. */
1951        if (dev_out)
1952                dev_put(dev_out);
1953        dev_out = FIB_RES_DEV(res);
1954        dev_hold(dev_out);
1955        key.oif = dev_out->ifindex;
1956
        /* Every "goto make_route" above, and the fall-through path, has
           assigned dev_out before reaching this label. */
1957make_route:
1958        if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1959                goto e_inval;
1960
        /* The destination address overrides whatever type was set before. */
1961        if (key.dst == 0xFFFFFFFF)
1962                res.type = RTN_BROADCAST;
1963        else if (MULTICAST(key.dst))
1964                res.type = RTN_MULTICAST;
1965        else if (BADCLASS(key.dst) || ZERONET(key.dst))
1966                goto e_inval;
1967
1968        if (dev_out->flags & IFF_LOOPBACK)
1969                flags |= RTCF_LOCAL;
1970
1971        if (res.type == RTN_BROADCAST) {
1972                flags |= RTCF_BROADCAST | RTCF_LOCAL;
1973                if (res.fi) {
1974                        fib_info_put(res.fi);
1975                        res.fi = NULL;
1976                }
1977        } else if (res.type == RTN_MULTICAST) {
                /* RTCF_LOCAL stays set only if ip_check_mc() accepts the
                   group on the output device. */
1978                flags |= RTCF_MULTICAST|RTCF_LOCAL;
1979                read_lock(&inetdev_lock);
1980                if (!__in_dev_get(dev_out) ||
1981                    !ip_check_mc(__in_dev_get(dev_out),oldkey->dst,oldkey->src))
1982                        flags &= ~RTCF_LOCAL;
1983                read_unlock(&inetdev_lock);
1984                /* If multicast route do not exist use
1985                   default one, but do not gateway in this case.
1986                   Yes, it is hack.
1987                 */
1988                if (res.fi && res.prefixlen < 4) {
1989                        fib_info_put(res.fi);
1990                        res.fi = NULL;
1991                }
1992        }
1993
1994        rth = dst_alloc(&ipv4_dst_ops);
1995        if (!rth)
1996                goto e_nobufs;
1997
        /* The cache key is taken from the caller's original key (oldkey),
           while rt_dst/rt_src carry the resolved addresses from "key". */
1998        atomic_set(&rth->u.dst.__refcnt, 1);
1999        rth->u.dst.flags= DST_HOST;
2000        rth->key.dst    = oldkey->dst;
2001        rth->key.tos    = tos;
2002        rth->key.src    = oldkey->src;
2003        rth->key.iif    = 0;
2004        rth->key.oif    = oldkey->oif;
2005#ifdef CONFIG_IP_ROUTE_FWMARK
2006        rth->key.fwmark = oldkey->fwmark;
2007#endif
2008        rth->rt_dst     = key.dst;
2009        rth->rt_src     = key.src;
2010#ifdef CONFIG_IP_ROUTE_NAT
2011        rth->rt_dst_map = key.dst;
2012        rth->rt_src_map = key.src;
2013#endif
2014        rth->rt_iif     = oldkey->oif ? : dev_out->ifindex;
2015        rth->u.dst.dev  = dev_out;
        /* Extra reference for rth->u.dst.dev; the local dev_out reference
           is dropped separately at "done". */
2016        dev_hold(dev_out);
2017        rth->rt_gateway = key.dst;
2018        rth->rt_spec_dst= key.src;
2019
2020        rth->u.dst.output=ip_output;
2021
2022        rt_cache_stat[smp_processor_id()].out_slow_tot++;
2023
2024        if (flags & RTCF_LOCAL) {
2025                rth->u.dst.input = ip_local_deliver;
2026                rth->rt_spec_dst = key.dst;
2027        }
2028        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2029                rth->rt_spec_dst = key.src;
2030                if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2031                        rth->u.dst.output = ip_mc_output;
2032                        rt_cache_stat[smp_processor_id()].out_slow_mc++;
2033                }
2034#ifdef CONFIG_IP_MROUTE
2035                if (res.type == RTN_MULTICAST) {
2036                        struct in_device *in_dev = in_dev_get(dev_out);
2037                        if (in_dev) {
2038                                if (IN_DEV_MFORWARD(in_dev) &&
2039                                    !LOCAL_MCAST(oldkey->dst)) {
2040                                        rth->u.dst.input = ip_mr_input;
2041                                        rth->u.dst.output = ip_mc_output;
2042                                }
2043                                in_dev_put(in_dev);
2044                        }
2045                }
2046#endif
2047        }
2048
2049        rt_set_nexthop(rth, &res, 0);
2050
2051        rth->rt_flags = flags;
2052
        /* Hash on the caller's original key, using the same inputs as the
           lookup in ip_route_output_key(). */
2053        hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos);
2054        err = rt_intern_hash(hash, rth, rp);
        /* Common exit: release the FIB result and the local device reference. */
2055done:
2056        if (free_res)
2057                fib_res_put(&res);
2058        if (dev_out)
2059                dev_put(dev_out);
2060out:    return err;
2061
2062e_inval:
2063        err = -EINVAL;
2064        goto done;
2065e_nobufs:
2066        err = -ENOBUFS;
2067        goto done;
2068}
2069
2070int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
2071{
2072        unsigned hash;
2073        struct rtable *rth;
2074
2075        hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);
2076
2077        read_lock_bh(&rt_hash_table[hash].lock);
2078        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2079                if (rth->key.dst == key->dst &&
2080                    rth->key.src == key->src &&
2081                    rth->key.iif == 0 &&
2082                    rth->key.oif == key->oif &&
2083#ifdef CONFIG_IP_ROUTE_FWMARK
2084                    rth->key.fwmark == key->fwmark &&
2085#endif
2086                    !((rth->key.tos ^ key->tos) &
2087                            (IPTOS_RT_MASK | RTO_ONLINK))) {
2088                        rth->u.dst.lastuse = jiffies;
2089                        dst_hold(&rth->u.dst);
2090                        rth->u.dst.__use++;
2091                        rt_cache_stat[smp_processor_id()].out_hit++;
2092                        read_unlock_bh(&rt_hash_table[hash].lock);
2093                        *rp = rth;
2094                        return 0;
2095                }
2096                rt_cache_stat[smp_processor_id()].out_hlist_search++;
2097        }
2098        read_unlock_bh(&rt_hash_table[hash].lock);
2099
2100        return ip_route_output_slow(rp, key);
2101}       
2102
/*
 * Append one netlink route message describing the cached route attached
 * to skb->dst to @skb.
 *
 * @pid, @seq and @event are passed to NLMSG_PUT for the message header.
 * @nowait selects NLM_F_MULTI (when @pid is also non-zero) and is passed
 * on to ipmr_get_route().
 *
 * Returns skb->len on success, 0 when ipmr_get_route() returned 0 in the
 * !nowait case, and -1 on failure after trimming @skb back to where this
 * message started.
 *
 * NOTE(review): NLMSG_PUT and RTA_PUT presumably jump to the
 * nlmsg_failure / rtattr_failure labels when @skb has no room left; the
 * macro definitions are not visible here -- confirm.
 */
2103static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2104                        int nowait)
2105{
2106        struct rtable *rt = (struct rtable*)skb->dst;
2107        struct rtmsg *r;
2108        struct nlmsghdr  *nlh;
        /* Start of this message; used for nlmsg_len and for rollback. */
2109        unsigned char    *b = skb->tail;
2110        struct rta_cacheinfo ci;
2111#ifdef CONFIG_IP_MROUTE
2112        struct rtattr *eptr;
2113#endif
2114        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2115        r = NLMSG_DATA(nlh);
2116        nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2117        r->rtm_family    = AF_INET;
2118        r->rtm_dst_len  = 32;
2119        r->rtm_src_len  = 0;
2120        r->rtm_tos      = rt->key.tos;
2121        r->rtm_table    = RT_TABLE_MAIN;
2122        r->rtm_type     = rt->rt_type;
2123        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2124        r->rtm_protocol = RTPROT_UNSPEC;
        /* Bits above the low 16 of rt_flags are passed through unchanged. */
2125        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2126        if (rt->rt_flags & RTCF_NOTIFY)
2127                r->rtm_flags |= RTM_F_NOTIFY;
2128        RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2129        if (rt->key.src) {
2130                r->rtm_src_len = 32;
2131                RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
2132        }
2133        if (rt->u.dst.dev)
2134                RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2135#ifdef CONFIG_NET_CLS_ROUTE
2136        if (rt->u.dst.tclassid)
2137                RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2138#endif
        /* Entries with key.iif set report rt_spec_dst as preferred source;
           others report rt_src, and only when it differs from the key. */
2139        if (rt->key.iif)
2140                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2141        else if (rt->rt_src != rt->key.src)
2142                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2143        if (rt->rt_dst != rt->rt_gateway)
2144                RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2145        if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
2146                goto rtattr_failure;
        /* Cache statistics; time values are reported relative to jiffies. */
2147        ci.rta_lastuse  = jiffies - rt->u.dst.lastuse;
2148        ci.rta_used     = rt->u.dst.__use;
2149        ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2150        if (rt->u.dst.expires)
2151                ci.rta_expires = rt->u.dst.expires - jiffies;
2152        else
2153                ci.rta_expires = 0;
2154        ci.rta_error    = rt->u.dst.error;
2155        ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2156        if (rt->peer) {
2157                ci.rta_id = rt->peer->ip_id_count;
2158                if (rt->peer->tcp_ts_stamp) {
2159                        ci.rta_ts = rt->peer->tcp_ts;
2160                        ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2161                }
2162        }
2163#ifdef CONFIG_IP_MROUTE
        /* Remember where the RTA_CACHEINFO attribute will start so that
           rta_error can be patched after ipmr_get_route() below. */
2164        eptr = (struct rtattr*)skb->tail;
2165#endif
2166        RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2167        if (rt->key.iif) {
2168#ifdef CONFIG_IP_MROUTE
2169                u32 dst = rt->rt_dst;
2170
2171                if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2172                    ipv4_devconf.mc_forwarding) {
2173                        int err = ipmr_get_route(skb, r, nowait);
2174                        if (err <= 0) {
2175                                if (!nowait) {
2176                                        if (err == 0)
2177                                                return 0;
2178                                        goto nlmsg_failure;
2179                                } else {
                                        /* Out of room aborts the message; any
                                           other result is recorded in the
                                           already written cacheinfo. */
2180                                        if (err == -EMSGSIZE)
2181                                                goto nlmsg_failure;
2182                                        ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2183                                }
2184                        }
2185                } else
2186#endif
2187                        RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
2188        }
2189
2190        nlh->nlmsg_len = skb->tail - b;
2191        return skb->len;
2192
        /* Roll the skb back to its length before this message was started. */
2193nlmsg_failure:
2194rtattr_failure:
2195        skb_trim(skb, b - skb->data);
2196        return -1;
2197}
2198
/*
 * Netlink handler that resolves a single route and unicasts the answer
 * back to the requester.
 *
 * @in_skb: the request; its NETLINK_CB pid is used as the reply target.
 * @nlh:    request header; its payload is read as a struct rtmsg.
 * @arg:    attribute table, indexed by attribute type minus one.
 *
 * With RTA_IIF set, the lookup goes through ip_route_input() on a dummy
 * skb as if a packet had arrived on that interface; otherwise
 * ip_route_output() is used with the optional RTA_OIF.  The resulting
 * route is formatted with rt_fill_info() and sent with netlink_unicast().
 *
 * Returns 0 on success or a negative errno (-ENOBUFS, -ENODEV,
 * -EMSGSIZE, or whatever the route lookup / netlink_unicast() reported).
 *
 * NOTE(review): the memcpy() calls below read a fixed 4 / sizeof(int)
 * bytes from each attribute without checking its payload length in this
 * function -- confirm that the caller validates attribute sizes.
 */
2199int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2200{
2201        struct rtattr **rta = arg;
2202        struct rtmsg *rtm = NLMSG_DATA(nlh);
2203        struct rtable *rt = NULL;
2204        u32 dst = 0;
2205        u32 src = 0;
2206        int iif = 0;
2207        int err = -ENOBUFS;
2208        struct sk_buff *skb;
2209
2210        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2211        if (!skb)
2212                goto out;
2213
2214        /* Reserve room for dummy headers, this skb can pass
2215           through good chunk of routing engine.
2216         */
2217        skb->mac.raw = skb->data;
2218        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2219
2220        if (rta[RTA_SRC - 1])
2221                memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2222        if (rta[RTA_DST - 1])
2223                memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2224        if (rta[RTA_IIF - 1])
2225                memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2226
2227        if (iif) {
                /* Input route: simulate reception on the given interface. */
2228                struct net_device *dev = __dev_get_by_index(iif);
2229                err = -ENODEV;
2230                if (!dev)
2231                        goto out_free;
2232                skb->protocol   = htons(ETH_P_IP);
2233                skb->dev        = dev;
2234                local_bh_disable();
2235                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2236                local_bh_enable();
2237                rt = (struct rtable*)skb->dst;
                /* A successful lookup may still yield an entry that carries
                   an error; report that error to the caller. */
2238                if (!err && rt->u.dst.error)
2239                        err = -rt->u.dst.error;
2240        } else {
                /* Output route, optionally bound to RTA_OIF. */
2241                int oif = 0;
2242                if (rta[RTA_OIF - 1])
2243                        memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2244                err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
2245        }
2246        if (err)
2247                goto out_free;
2248
        /* rt_fill_info() reads the route from skb->dst. */
2249        skb->dst = &rt->u.dst;
2250        if (rtm->rtm_flags & RTM_F_NOTIFY)
2251                rt->rt_flags |= RTCF_NOTIFY;
2252
2253        NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2254
2255        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2256                                RTM_NEWROUTE, 0);
        /* 0 from rt_fill_info(): nothing is sent and 0 is returned;
           a negative result is reported as -EMSGSIZE. */
2257        if (!err)
2258                goto out_free;
2259        if (err < 0) {
2260                err = -EMSGSIZE;
2261                goto out_free;
2262        }
2263
        /* The skb is handed to netlink_unicast() here, so this path does
           not go through out_free. */
2264        err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2265        if (err > 0)
2266                err = 0;
2267out:    return err;
2268
2269out_free:
2270        kfree_skb(skb);
2271        goto out;
2272}
2273
2274int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2275{
2276        struct rtable *rt;
2277        int h, s_h;
2278        int idx, s_idx;
2279
2280        s_h = cb->args[0];
2281        s_idx = idx = cb->args[1];
2282        for (h = 0; h <= rt_hash_mask; h++) {
2283                if (h < s_h) continue;
2284                if (h > s_h)
2285                        s_idx = 0;
2286                read_lock_bh(&rt_hash_table[h].lock);
2287                for (rt = rt_hash_table[h].chain, idx = 0; rt;
2288                     rt = rt->u.rt_next, idx++) {
2289                        if (idx < s_idx)
2290                                continue;
2291                        skb->dst = dst_clone(&rt->u.dst);
2292                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2293                                         cb->nlh->nlmsg_seq,
2294                                         RTM_NEWROUTE, 1) <= 0) {
2295                                dst_release(xchg(&skb->dst, NULL));
2296                                read_unlock_bh(&rt_hash_table[h].lock);
2297                                goto done;
2298                        }
2299                        dst_release(xchg(&skb->dst, NULL));
2300                }
2301                read_unlock_bh(&rt_hash_table[h].lock);
2302        }
2303
2304done:
2305        cb->args[0] = h;
2306        cb->args[1] = idx;
2307        return skb->len;
2308}
2309
/*
 * Multicast change notification: calls rt_cache_flush(0).
 * The @in_dev argument is not used.
 */
2310void ip_rt_multicast_event(struct in_device *in_dev)
2311{
2312        rt_cache_flush(0);
2313}
2314
2315#ifdef CONFIG_SYSCTL
2316static int flush_delay;
2317
2318static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2319                                        struct file *filp, void *buffer,
2320                                        size_t *lenp)
2321{
2322        if (write) {
2323                proc_dointvec(ctl, write, filp, buffer, lenp);
2324                rt_cache_flush(flush_delay);
2325                return 0;
2326        } 
2327
2328        return -EINVAL;
2329}
2330
2331static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
2332                                                int nlen, void *oldval,
2333                                                size_t *oldlenp, void *newval,
2334                                                size_t newlen, void **context)
2335{
2336        int delay;
2337        if (newlen != sizeof(int))
2338                return -EINVAL;
2339        if (get_user(delay, (int *)newval))
2340                return -EFAULT; 
2341        rt_cache_flush(delay); 
2342        return 0;
2343}
2344
2345ctl_table ipv4_route_table[] = {
2346        {
2347                ctl_name:       NET_IPV4_ROUTE_FLUSH,
2348                procname:       "flush",
2349                data:           &flush_delay,
2350                maxlen:         sizeof(int),
2351                mode:           0644,
2352                proc_handler:   &ipv4_sysctl_rtcache_flush,
2353                strategy:       &ipv4_sysctl_rtcache_flush_strategy,
2354        },
2355        {
2356                ctl_name:       NET_IPV4_ROUTE_MIN_DELAY,
2357                procname:       "min_delay",
2358                data:           &ip_rt_min_delay,
2359                maxlen:         sizeof(int),
2360                mode:           0644,
2361                proc_handler:   &proc_dointvec_jiffies,
2362                strategy:       &sysctl_jiffies,
2363        },
2364        {
2365                ctl_name:       NET_IPV4_ROUTE_MAX_DELAY,
2366                procname:       "max_delay",
2367                data:           &ip_rt_max_delay,
2368                maxlen:         sizeof(int),
2369                mode:           0644,
2370                proc_handler:   &proc_dointvec_jiffies,
2371                strategy:       &sysctl_jiffies,
2372        },
2373        {
2374                ctl_name:       NET_IPV4_ROUTE_GC_THRESH,
2375                procname:       "gc_thresh",
2376                data:           &ipv4_dst_ops.gc_thresh,
2377                maxlen:         sizeof(int),
2378                mode:           0644,
2379                proc_handler:   &proc_dointvec,
2380        },
2381        {
2382                ctl_name:       NET_IPV4_ROUTE_MAX_SIZE,
2383                procname:       "max_size",
2384                data:           &ip_rt_max_size,
2385                maxlen:         sizeof(int),
2386                mode:           0644,
2387                proc_handler:   &proc_dointvec,
2388        },
2389        {
2390                ctl_name:       NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2391                procname:       "gc_min_interval",
2392                data:           &ip_rt_gc_min_interval,
2393                maxlen:         sizeof(int),
2394                mode:           0644,
2395                proc_handler:   &proc_dointvec_jiffies,
2396                strategy:       &sysctl_jiffies,
2397        },
2398        {
2399                ctl_name:       NET_IPV4_ROUTE_GC_TIMEOUT,
2400                procname:       "gc_timeout",
2401                data:           &ip_rt_gc_timeout,
2402                maxlen:         sizeof(int),
2403                mode:           0644,
2404                proc_handler:   &proc_dointvec_jiffies,
2405                strategy:       &sysctl_jiffies,
2406        },
2407        {
2408                ctl_name:       NET_IPV4_ROUTE_GC_INTERVAL,
2409                procname:       "gc_interval",
2410                data:           &ip_rt_gc_interval,
2411                maxlen:         sizeof(int),
2412                mode:           0644,
2413                proc_handler:   &proc_dointvec_jiffies,
2414                strategy:       &sysctl_jiffies,
2415        },
2416        {
2417                ctl_name:       NET_IPV4_ROUTE_REDIRECT_LOAD,
2418                procname:       "redirect_load",
2419                data:           &ip_rt_redirect_load,
2420                maxlen:         sizeof(int),
2421                mode:           0644,
2422                proc_handler:   &proc_dointvec,
2423        },
2424        {
2425                ctl_name:       NET_IPV4_ROUTE_REDIRECT_NUMBER,
2426                procname:       "redirect_number",
2427                data:           &ip_rt_redirect_number,
2428                maxlen:         sizeof(int),
2429                mode:           0644,
2430                proc_handler:   &proc_dointvec,
2431        },
2432        {
2433                ctl_name:       NET_IPV4_ROUTE_REDIRECT_SILENCE,
2434                procname:       "redirect_silence",
2435                data:           &ip_rt_redirect_silence,
2436                maxlen:         sizeof(int),
2437                mode:           0644,
2438                proc_handler:   &proc_dointvec,
2439        },
2440        {
2441                ctl_name:       NET_IPV4_ROUTE_ERROR_COST,
2442                procname:       "error_cost",
2443                data:           &ip_rt_error_cost,
2444                maxlen:         sizeof(int),
2445                mode:           0644,
2446                proc_handler:   &proc_dointvec,
2447        },
2448        {
2449                ctl_name:       NET_IPV4_ROUTE_ERROR_BURST,
2450                procname:       "error_burst",
2451                data:           &ip_rt_error_burst,
2452                maxlen:         sizeof(int),
2453                mode:           0644,
2454                proc_handler:   &proc_dointvec,
2455        },
2456        {
2457                ctl_name:       NET_IPV4_ROUTE_GC_ELASTICITY,
2458                procname:       "gc_elasticity",
2459                data:           &ip_rt_gc_elasticity,
2460                maxlen:         sizeof(int),
2461                mode:           0644,
2462                proc_handler:   &proc_dointvec,
2463        },
2464        {
2465                ctl_name:       NET_IPV4_ROUTE_MTU_EXPIRES,
2466                procname:       "mtu_expires",
2467                data:           &ip_rt_mtu_expires,
2468                maxlen:         sizeof(int),
2469                mode:           0644,
2470                proc_handler:   &proc_dointvec_jiffies,
2471                strategy:       &sysctl_jiffies,
2472        },
2473        {
2474                ctl_name:       NET_IPV4_ROUTE_MIN_PMTU,
2475                procname:       "min_pmtu",
2476                data:           &ip_rt_min_pmtu,
2477                maxlen:         sizeof(int),
2478                mode:           0644,
2479                proc_handler:   &proc_dointvec,
2480        },
2481        {
2482                ctl_name:       NET_IPV4_ROUTE_MIN_ADVMSS,
2483                procname:       "min_adv_mss",
2484                data:           &ip_rt_min_advmss,
2485                maxlen:         sizeof(int),
2486                mode:           0644,
2487                proc_handler:   &proc_dointvec,
2488        },
2489        {
2490                ctl_name:       NET_IPV4_ROUTE_SECRET_INTERVAL,
2491                procname:       "secret_interval",
2492                data:           &ip_rt_secret_interval,
2493                maxlen:         sizeof(int),
2494                mode:           0644,
2495                proc_handler:   &proc_dointvec_jiffies,
2496                strategy:       &sysctl_jiffies,
2497        },
2498         { 0 }
2499};
2500#endif
2501
2502#ifdef CONFIG_NET_CLS_ROUTE
/* Base of the per-cpu route accounting table (set up in ip_rt_init()). */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * Start of the 256-entry accounting slice that belongs to the given
 * logical cpu number.
 */
#define IP_RT_ACCT_CPU(cpu) (&ip_rt_acct[cpu_logical_map(cpu) * 256])
2509
2510static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2511                           int length, int *eof, void *data)
2512{
2513        unsigned int i;
2514
2515        if ((offset & 3) || (length & 3))
2516                return -EIO;
2517
2518        if (offset >= sizeof(struct ip_rt_acct) * 256) {
2519                *eof = 1;
2520                return 0;
2521        }
2522
2523        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2524                length = sizeof(struct ip_rt_acct) * 256 - offset;
2525                *eof = 1;
2526        }
2527
2528        offset /= sizeof(u32);
2529
2530        if (length > 0) {
2531                u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2532                u32 *dst = (u32 *) buffer;
2533
2534                /* Copy first cpu. */
2535                *start = buffer;
2536                memcpy(dst, src, length);
2537
2538                /* Add the other cpus in, one int at a time */
2539                for (i = 1; i < smp_num_cpus; i++) {
2540                        unsigned int j;
2541
2542                        src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2543
2544                        for (j = 0; j < length/4; j++)
2545                                dst[j] += src[j];
2546                }
2547        }
2548        return length;
2549}
2550#endif
2551
2552void __init ip_rt_init(void)
2553{
2554        int i, order, goal;
2555
2556        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2557                             (jiffies ^ (jiffies >> 7)));
2558
2559#ifdef CONFIG_NET_CLS_ROUTE
2560        for (order = 0;
2561             (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2562                /* NOTHING */;
2563        ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2564        if (!ip_rt_acct)
2565                panic("IP: failed to allocate ip_rt_acct\n");
2566        memset(ip_rt_acct, 0, PAGE_SIZE << order);
2567#endif
2568
2569        ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2570                                                     sizeof(struct rtable),
2571                                                     0, SLAB_HWCACHE_ALIGN,
2572                                                     NULL, NULL);
2573
2574        if (!ipv4_dst_ops.kmem_cachep)
2575                panic("IP: failed to allocate ip_dst_cache\n");
2576
2577        goal = num_physpages >> (26 - PAGE_SHIFT);
2578
2579        for (order = 0; (1UL << order) < goal; order++)
2580                /* NOTHING */;
2581
2582        do {
2583                rt_hash_mask = (1UL << order) * PAGE_SIZE /
2584                        sizeof(struct rt_hash_bucket);
2585                while (rt_hash_mask & (rt_hash_mask - 1))
2586                        rt_hash_mask--;
2587                rt_hash_table = (struct rt_hash_bucket *)
2588                        __get_free_pages(GFP_ATOMIC, order);
2589        } while (rt_hash_table == NULL && --order > 0);
2590
2591        if (!rt_hash_table)
2592                panic("Failed to allocate IP route cache hash table\n");
2593
2594        printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2595               rt_hash_mask,
2596               (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2597
2598        for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2599                /* NOTHING */;
2600
2601        rt_hash_mask--;
2602        for (i = 0; i <= rt_hash_mask; i++) {
2603                rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
2604                rt_hash_table[i].chain = NULL;
2605        }
2606
2607        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2608        ip_rt_max_size = (rt_hash_mask + 1) * 16;
2609
2610        devinet_init();
2611        ip_fib_init();
2612
2613        rt_flush_timer.function = rt_run_flush;
2614        rt_periodic_timer.function = rt_check_expire;
2615        rt_secret_timer.function = rt_secret_rebuild;
2616
2617        /* All the timers, started at system startup tend
2618           to synchronize. Perturb it a bit.
2619         */
2620        rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2621                                        ip_rt_gc_interval;
2622        add_timer(&rt_periodic_timer);
2623
2624        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2625                ip_rt_secret_interval;
2626        add_timer(&rt_secret_timer);
2627
2628        proc_net_create ("rt_cache", 0, rt_cache_get_info);
2629        create_proc_info_entry ("rt_cache", 0, proc_net_stat, 
2630                                rt_cache_stat_get_info);
2631#ifdef CONFIG_NET_CLS_ROUTE
2632        create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
2633#endif
2634}
2635
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.