linux-old/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Version:     $Id: route.c,v 1.65 1999/03/25 10:04:35 davem Exp $
   9 *
  10 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15 *
  16 * Fixes:
  17 *              Alan Cox        :       Verify area fixes.
  18 *              Alan Cox        :       cli() protects routing changes
  19 *              Rui Oliveira    :       ICMP routing table updates
  20 *              (rco@di.uminho.pt)      Routing table insertion and update
  21 *              Linus Torvalds  :       Rewrote bits to be sensible
  22 *              Alan Cox        :       Added BSD route gw semantics
  23 *              Alan Cox        :       Super /proc >4K 
  24 *              Alan Cox        :       MTU in route table
  25 *              Alan Cox        :       MSS actually. Also added the window
  26 *                                      clamper.
  27 *              Sam Lantinga    :       Fixed route matching in rt_del()
  28 *              Alan Cox        :       Routing cache support.
  29 *              Alan Cox        :       Removed compatibility cruft.
  30 *              Alan Cox        :       RTF_REJECT support.
  31 *              Alan Cox        :       TCP irtt support.
  32 *              Jonathan Naylor :       Added Metric support.
  33 *      Miquel van Smoorenburg  :       BSD API fixes.
  34 *      Miquel van Smoorenburg  :       Metrics.
  35 *              Alan Cox        :       Use __u32 properly
  36 *              Alan Cox        :       Aligned routing errors more closely with BSD
  37 *                                      our system is still very different.
  38 *              Alan Cox        :       Faster /proc handling
  39 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40 *                                      routing caches and better behaviour.
  41 *              
  42 *              Olaf Erb        :       irtt wasn't being copied right.
  43 *              Bjorn Ekwall    :       Kerneld route support.
  44 *              Alan Cox        :       Multicast fixed (I hope)
  45 *              Pavel Krauz     :       Limited broadcast fixed
  46 *              Mike McLagan    :       Routing by source
  47 *      Alexey Kuznetsov        :       End of old history. Splitted to fib.c and
  48 *                                      route.c and rewritten from scratch.
  49 *              Andi Kleen      :       Load-limit warning messages.
  50 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54 *              Marc Boucher    :       routing by fwmark
  55 *
  56 *              This program is free software; you can redistribute it and/or
  57 *              modify it under the terms of the GNU General Public License
  58 *              as published by the Free Software Foundation; either version
  59 *              2 of the License, or (at your option) any later version.
  60 */
  61
  62#include <linux/config.h>
  63#include <asm/uaccess.h>
  64#include <asm/system.h>
  65#include <asm/bitops.h>
  66#include <linux/types.h>
  67#include <linux/kernel.h>
  68#include <linux/sched.h>
  69#include <linux/mm.h>
  70#include <linux/string.h>
  71#include <linux/socket.h>
  72#include <linux/sockios.h>
  73#include <linux/errno.h>
  74#include <linux/in.h>
  75#include <linux/inet.h>
  76#include <linux/netdevice.h>
  77#include <linux/proc_fs.h>
  78#include <linux/init.h>
  79#include <linux/skbuff.h>
  80#include <linux/rtnetlink.h>
  81#include <linux/inetdevice.h>
  82#include <linux/igmp.h>
  83#include <linux/pkt_sched.h>
  84#include <linux/mroute.h>
  85#include <net/protocol.h>
  86#include <net/ip.h>
  87#include <net/route.h>
  88#include <net/sock.h>
  89#include <net/ip_fib.h>
  90#include <net/arp.h>
  91#include <net/tcp.h>
  92#include <net/icmp.h>
  93#ifdef CONFIG_SYSCTL
  94#include <linux/sysctl.h>
  95#endif
  96
   /* NOTE(review): IP_MAX_MTU is not referenced in the part of the file visible here. */
   97#define IP_MAX_MTU      0xFFF0
   98
   /* Initial value of ip_rt_gc_timeout and of the adaptive "expire" strength in rt_garbage_collect(). */
   99#define RT_GC_TIMEOUT (300*HZ)
  100
  /*
   * Route cache tunables.  Values built from HZ are in jiffies.
   *
   *   ip_rt_min_delay        flush delay rt_cache_flush() uses when called with delay < 0
   *   ip_rt_max_delay        rt_cache_flush() sets rt_deadline to now + this value
   *   ip_rt_gc_thresh        NOTE(review): not referenced in the visible code
   *   ip_rt_max_size         entry count at which rt_garbage_collect() ignores its
   *                          minimum interval and may report "dst cache overflow"
   *   ip_rt_gc_timeout       base timeout in rt_check_expire(); upper bound for "expire"
   *   ip_rt_gc_interval      re-arm period of rt_periodic_timer (see rt_check_expire())
   *   ip_rt_gc_min_interval  minimum spacing of GC runs; also the step by which
   *                          rt_garbage_collect() raises "expire" after a successful run
   *   ip_rt_redirect_number  redirects per route after which ip_rt_send_redirect() stops sending
   *   ip_rt_redirect_load    base redirect spacing, shifted left by the redirects already sent
   *   ip_rt_redirect_silence quiet period after which the redirect counter is reset
   *   ip_rt_error_cost       tokens consumed by ip_error() per ICMP error sent
   *   ip_rt_error_burst      cap on the tokens ip_error() lets a route accumulate
   *   ip_rt_gc_elasticity    plain multiplier (not jiffies): GC goal is
   *                          entries - RT_HASH_DIVISOR*ip_rt_gc_elasticity
   *   ip_rt_mtu_expires      NOTE(review): not referenced in the visible code
   */
  101int ip_rt_min_delay = 2*HZ;
  102int ip_rt_max_delay = 10*HZ;
  103int ip_rt_gc_thresh = RT_HASH_DIVISOR;
  104int ip_rt_max_size = RT_HASH_DIVISOR*16;
  105int ip_rt_gc_timeout = RT_GC_TIMEOUT;
  106int ip_rt_gc_interval = 60*HZ;
  107int ip_rt_gc_min_interval = 5*HZ;
  108int ip_rt_redirect_number = 9;
  109int ip_rt_redirect_load = HZ/50;
  110int ip_rt_redirect_silence = ((HZ/50) << (9+1));
  111int ip_rt_error_cost = HZ;
  112int ip_rt_error_burst = 5*HZ;
  113int ip_rt_gc_elasticity = 8;
  114int ip_rt_mtu_expires = 10*60*HZ;
  115
  /*
   * Deadline for a pending delayed flush; 0 means none.
   * Set by rt_cache_flush(), cleared by rt_run_flush().
   */
  116static unsigned long rt_deadline = 0;
  117
  /* Debug print helper: printk at KERN_DEBUG level. */
  118#define RTprint(a...)   printk(KERN_DEBUG a)
  119
  120static void rt_run_flush(unsigned long dummy);
  121
  /*
   * Timers, initialised positionally.  rt_flush_timer is armed by
   * rt_cache_flush() and names rt_run_flush in its initialiser.
   * rt_periodic_timer is re-armed by rt_check_expire(); its last field is
   * NULL here (presumably filled in by init code not visible here -- TODO confirm).
   */
  122static struct timer_list rt_flush_timer =
  123        { NULL, NULL, 0, 0L, rt_run_flush };
  124static struct timer_list rt_periodic_timer =
  125        { NULL, NULL, 0, 0L, NULL };
 126
 127/*
 128 *      Interface to generic destination cache.
 129 */
 130
  /*
   * Forward declarations of the callbacks named in ipv4_dst_ops below.
   * Of these, only rt_garbage_collect() and ipv4_negative_advice() are
   * defined in the part of the file reviewed here.
   */
  131static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32);
  132static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
  133                                           struct sk_buff *);
  134static struct dst_entry * ipv4_negative_advice(struct dst_entry *);
  135static void               ipv4_link_failure(struct sk_buff *skb);
  136static int rt_garbage_collect(void);
  137
  138
  /*
   * IPv4 operations table for the generic destination cache.
   * The initialiser is positional; field order comes from struct dst_ops,
   * which is declared in a header not shown here.  This file reads
   * ipv4_dst_ops.entries and ipv4_dst_ops.gc_thresh in rt_garbage_collect().
   */
  139struct dst_ops ipv4_dst_ops =
  140{
  141        AF_INET,
  142        __constant_htons(ETH_P_IP),
  143        RT_HASH_DIVISOR,
  144
  145        rt_garbage_collect,
  146        ipv4_dst_check,
  147        ipv4_dst_reroute,
  148        NULL,
  149        ipv4_negative_advice,
  150        ipv4_link_failure,
  151};
 152
  /*
   * 16-entry table of TC_PRIO_* values.  Every odd index holds
   * TC_PRIO_FILLER; even indices step through BESTEFFORT, BULK,
   * INTERACTIVE and INTERACTIVE_BULK in pairs.
   * NOTE(review): presumably indexed by the IP TOS bits -- the lookup
   * itself is not in the part of the file visible here; confirm at the user.
   */
  153__u8 ip_tos2prio[16] = {
  154        TC_PRIO_BESTEFFORT,
  155        TC_PRIO_FILLER,
  156        TC_PRIO_BESTEFFORT,
  157        TC_PRIO_FILLER,
  158        TC_PRIO_BULK,
  159        TC_PRIO_FILLER,
  160        TC_PRIO_BULK,
  161        TC_PRIO_FILLER,
  162        TC_PRIO_INTERACTIVE,
  163        TC_PRIO_FILLER,
  164        TC_PRIO_INTERACTIVE,
  165        TC_PRIO_FILLER,
  166        TC_PRIO_INTERACTIVE_BULK,
  167        TC_PRIO_FILLER,
  168        TC_PRIO_INTERACTIVE_BULK,
  169        TC_PRIO_FILLER
  170};
 171
 172
 173/*
 174 * Route cache.
 175 */
 176
  /*
   * The route cache: RT_HASH_DIVISOR buckets, each the head of a singly
   * linked chain of struct rtable threaded through u.rt_next.  Buckets are
   * selected with rt_hash_code().
   */
  177struct rtable   *rt_hash_table[RT_HASH_DIVISOR];
  178
  /* Inserts rth into bucket "hash" (or reuses an equal-key entry) and returns the cached entry via *res. */
  179static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
 180
 181static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
 182{
 183        unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
 184        hash = hash^saddr^tos;
 185        hash = hash^(hash>>16);
 186        return (hash^(hash>>8)) & 0xFF;
 187}
 188
 189#ifdef CONFIG_PROC_FS
 190
 191static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
 192{
 193        int len=0;
 194        off_t pos=0;
 195        char temp[129];
 196        struct rtable *r;
 197        int i;
 198
 199        pos = 128;
 200
 201        if (offset<128) {
 202                sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
 203                len = 128;
 204        }
 205        
 206        
 207        start_bh_atomic();
 208
 209        for (i = 0; i<RT_HASH_DIVISOR; i++) {
 210                for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
 211                        /*
 212                         *      Spin through entries until we are ready
 213                         */
 214                        pos += 128;
 215
 216                        if (pos <= offset) {
 217                                len = 0;
 218                                continue;
 219                        }
 220                        sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 221                                r->u.dst.dev ? r->u.dst.dev->name : "*",
 222                                (unsigned long)r->rt_dst,
 223                                (unsigned long)r->rt_gateway,
 224                                r->rt_flags,
 225                                atomic_read(&r->u.dst.use),
 226                                atomic_read(&r->u.dst.refcnt),
 227                                0,
 228                                (unsigned long)r->rt_src, (int)r->u.dst.pmtu,
 229                                r->u.dst.window,
 230                                (int)r->u.dst.rtt, r->key.tos,
 231                                r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 232                                r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
 233                                r->rt_spec_dst);
 234                        sprintf(buffer+len,"%-127s\n",temp);
 235                        len += 128;
 236                        if (pos >= offset+length)
 237                                goto done;
 238                }
 239        }
 240
 241done:
 242        end_bh_atomic();
 243        
 244        *start = buffer+len-(pos-offset);
 245        len = pos-offset;
 246        if (len>length)
 247                len = length;
 248        return len;
 249}
 250#endif
 251  
 252static __inline__ void rt_free(struct rtable *rt)
 253{
 254        dst_free(&rt->u.dst);
 255}
 256
 257static __inline__ void rt_drop(struct rtable *rt)
 258{
 259        ip_rt_put(rt);
 260        dst_free(&rt->u.dst);
 261}
 262
 263static __inline__ int rt_fast_clean(struct rtable *rth)
 264{
 265        /* Kill broadcast/multicast entries very aggresively, if they
 266           collide in hash table with more useful entries */
 267        return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
 268                && rth->key.iif && rth->u.rt_next);
 269}
 270
 271static __inline__ int rt_valuable(struct rtable *rth)
 272{
 273        return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
 274                || rth->u.dst.expires);
 275}
 276
 277static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
 278{
 279        int age;
 280
 281        if (atomic_read(&rth->u.dst.use))
 282                return 0;
 283
 284        age = jiffies - rth->u.dst.lastuse;
 285        if (age <= tmo1 && !rt_fast_clean(rth))
 286                return 0;
 287        if (age <= tmo2 && rt_valuable(rth))
 288                return 0;
 289        return 1;
 290}
 291
 292static void rt_check_expire(unsigned long dummy)
 293{
 294        int i;
 295        static int rover;
 296        struct rtable *rth, **rthp;
 297        unsigned long now = jiffies;
 298
 299        for (i=0; i<RT_HASH_DIVISOR/5; i++) {
 300                unsigned tmo = ip_rt_gc_timeout;
 301
 302                rover = (rover + 1) & (RT_HASH_DIVISOR-1);
 303                rthp = &rt_hash_table[rover];
 304
 305                while ((rth = *rthp) != NULL) {
 306                        if (rth->u.dst.expires) {
 307                                /* Entrie is expired even if it is in use */
 308                                if ((long)(now - rth->u.dst.expires) < tmo) {
 309                                        tmo >>= 1;
 310                                        rthp = &rth->u.rt_next;
 311                                        continue;
 312                                }
 313                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 314                                tmo >>= 1;
 315                                rthp = &rth->u.rt_next;
 316                                continue;
 317                        }
 318
 319                        /*
 320                         * Cleanup aged off entries.
 321                         */
 322                        *rthp = rth->u.rt_next;
 323                        rt_free(rth);
 324                }
 325
 326                /* Fallback loop breaker. */
 327                if ((jiffies - now) > 0)
 328                        break;
 329        }
 330        rt_periodic_timer.expires = now + ip_rt_gc_interval;
 331        add_timer(&rt_periodic_timer);
 332}
 333
 334static void rt_run_flush(unsigned long dummy)
 335{
 336        int i;
 337        struct rtable * rth, * next;
 338
 339        rt_deadline = 0;
 340
 341        start_bh_atomic();
 342        for (i=0; i<RT_HASH_DIVISOR; i++) {
 343                if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL)
 344                        continue;
 345                end_bh_atomic();
 346
 347                for (; rth; rth=next) {
 348                        next = rth->u.rt_next;
 349                        rth->u.rt_next = NULL;
 350                        rt_free(rth);
 351                }
 352
 353                start_bh_atomic();
 354        }
 355        end_bh_atomic();
 356}
 357  
 358void rt_cache_flush(int delay)
 359{
 360        unsigned long now = jiffies;
 361        int user_mode = !in_interrupt();
 362
 363        if (delay < 0)
 364                delay = ip_rt_min_delay;
 365
 366        start_bh_atomic();
 367
 368        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 369                long tmo = (long)(rt_deadline - now);
 370
 371                /* If flush timer is already running
 372                   and flush request is not immediate (delay > 0):
 373
 374                   if deadline is not achieved, prolongate timer to "delay",
 375                   otherwise fire it at deadline time.
 376                 */
 377
 378                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 379                        tmo = 0;
 380                
 381                if (delay > tmo)
 382                        delay = tmo;
 383        }
 384
 385        if (delay <= 0) {
 386                end_bh_atomic();
 387                rt_run_flush(0);
 388                return;
 389        }
 390
 391        if (rt_deadline == 0)
 392                rt_deadline = now + ip_rt_max_delay;
 393
 394        rt_flush_timer.expires = now + delay;
 395        add_timer(&rt_flush_timer);
 396        end_bh_atomic();
 397}
 398
  399/*
  400   Short description of GC goals.
  401
  402   We want to build an algorithm which keeps the routing cache
  403   at some equilibrium point, where the number of aged-off entries
  404   is kept approximately equal to the number of newly generated ones.
  405
  406   The current expiration strength is the variable "expire".
  407   We try to adjust it dynamically, so that when networking
  408   is idle "expire" is large enough to keep enough warm entries,
  409   and when load increases it is reduced to limit the cache size.
  410 */
 411
  /*
   * Route cache garbage collector.  Named in the ipv4_dst_ops initialiser
   * and also called directly from rt_intern_hash().
   *
   * Returns 0 when nothing had to be done, when the goal was met, or when
   * the cache ends up below ip_rt_max_size.  Returns 1 (after a
   * rate-limited "dst cache overflow" message) when the cache is still at
   * or above ip_rt_max_size afterwards.
   *
   * State kept between calls (function-static):
   *   expire       idle-time limit passed to rt_may_expire(); halved when a
   *                sweep misses its goal, raised again at work_done
   *   last_gc      jiffies of the last run that was not rate limited
   *   rover        bucket at which the previous sweep stopped
   *   equilibrium  entry count the collector is currently steering towards
   */
  412static int rt_garbage_collect(void)
  413{
  414        static unsigned expire = RT_GC_TIMEOUT;
  415        static unsigned long last_gc;
  416        static int rover;
  417        static int equilibrium;
  418        struct rtable *rth, **rthp;
  419        unsigned long now = jiffies;
  420        int goal;
  421
  422        /*
  423         * Garbage collection is pretty expensive,
  424         * do not make it too frequently.
  425         */
  426        if (now - last_gc < ip_rt_gc_min_interval &&
  427            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
  428                return 0;
  429
  430        /* Calculate number of entries, which we want to expire now. */
  431        goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
  432        if (goal <= 0) {
              /* Below the elasticity limit: aim for "equilibrium", never less than gc_thresh. */
  433                if (equilibrium < ipv4_dst_ops.gc_thresh)
  434                        equilibrium = ipv4_dst_ops.gc_thresh;
  435                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
  436                if (goal > 0) {
                      /* Let the target grow by half the excess, at most RT_HASH_DIVISOR. */
  437                        equilibrium += min(goal/2, RT_HASH_DIVISOR);
  438                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
  439                }
  440        } else {
  441                /* We are in dangerous area. Try to reduce cache really
  442                 * aggressively.
  443                 */
  444                goal = max(goal/2, RT_HASH_DIVISOR);
  445                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
  446        }
  447
  448        if (now - last_gc >= ip_rt_gc_min_interval)
  449                last_gc = now;
  450
  451        if (goal <= 0) {
  452                equilibrium += goal;
  453                goto work_done;
  454        }
  455
  456        do {
  457                int i, k;
  458
              /* One full sweep over the table, starting just after "rover". */
  459                start_bh_atomic();
  460                for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
  461                        unsigned tmo = expire;
  462
  463                        k = (k + 1) & (RT_HASH_DIVISOR-1);
  464                        rthp = &rt_hash_table[k];
  465                        while ((rth = *rthp) != NULL) {
  466                                if (!rt_may_expire(rth, tmo, expire)) {
                                      /* Kept: entries further down this chain get half the allowance. */
  467                                        tmo >>= 1;
  468                                        rthp = &rth->u.rt_next;
  469                                        continue;
  470                                }
  471                                *rthp = rth->u.rt_next;
  472                                rth->u.rt_next = NULL;
  473                                rt_free(rth);
  474                                goal--;
  475                        }
  476                        if (goal <= 0)
  477                                break;
  478                }
  479                rover = k;
  480                end_bh_atomic();
  481
  482                if (goal <= 0)
  483                        goto work_done;
  484
  485                /* Goal is not achieved. We stop process if:
  486
  487                   - if expire reduced to zero. Otherwise, expire is halfed.
  488                   - if table is not full.
  489                   - if we are called from interrupt.
  490                   - jiffies check is just fallback/debug loop breaker.
  491                     We will not spin here for long time in any case.
  492                 */
  493
  494                if (expire == 0)
  495                        break;
  496
  497                expire >>= 1;
  498#if RT_CACHE_DEBUG >= 2
  499                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
  500#endif
  501
  502                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
  503                        return 0;
  504        } while (!in_interrupt() && jiffies - now < 1);
  505
      /* Goal missed: report failure only if the cache is still at or above its hard limit. */
  506        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
  507                return 0;
  508        if (net_ratelimit())
  509                printk("dst cache overflow\n");
  510        return 1;
  511
  512work_done:
      /* Goal met: relax "expire" again, capped at (or reset to) ip_rt_gc_timeout. */
  513        expire += ip_rt_gc_min_interval;
  514        if (expire > ip_rt_gc_timeout ||
  515            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
  516                expire = ip_rt_gc_timeout;
  517#if RT_CACHE_DEBUG >= 2
  518        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
  519#endif
  520        return 0;
  521}
 522
 523static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
 524{
 525        struct rtable   *rth, **rthp;
 526        unsigned long   now = jiffies;
 527        int attempts = !in_interrupt();
 528
 529restart:
 530        start_bh_atomic();
 531
 532        rthp = &rt_hash_table[hash];
 533
 534        while ((rth = *rthp) != NULL) {
 535                if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
 536                        /* Put it first */
 537                        *rthp = rth->u.rt_next;
 538                        rth->u.rt_next = rt_hash_table[hash];
 539                        rt_hash_table[hash] = rth;
 540
 541                        atomic_inc(&rth->u.dst.refcnt);
 542                        atomic_inc(&rth->u.dst.use);
 543                        rth->u.dst.lastuse = now;
 544                        end_bh_atomic();
 545
 546                        rt_drop(rt);
 547                        *rp = rth;
 548                        return 0;
 549                }
 550
 551                rthp = &rth->u.rt_next;
 552        }
 553
 554        /* Try to bind route to arp only if it is output
 555           route or unicast forwarding path.
 556         */
 557        if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
 558                if (!arp_bind_neighbour(&rt->u.dst)) {
 559                        end_bh_atomic();
 560
 561                        /* Neighbour tables are full and nothing
 562                           can be released. Try to shrink route cache,
 563                           it is most likely it holds some neighbour records.
 564                         */
 565                        if (attempts-- > 0) {
 566                                int saved_elasticity = ip_rt_gc_elasticity;
 567                                ip_rt_gc_elasticity = 1;
 568                                rt_garbage_collect();
 569                                ip_rt_gc_elasticity = saved_elasticity;
 570                                goto restart;
 571                        }
 572
 573                        rt_drop(rt);
 574                        if (net_ratelimit())
 575                                printk("neighbour table overflow\n");
 576                        return -ENOBUFS;
 577                }
 578        }
 579
 580        rt->u.rt_next = rt_hash_table[hash];
 581#if RT_CACHE_DEBUG >= 2
 582        if (rt->u.rt_next) {
 583                struct rtable * trt;
 584                printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
 585                for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next)
 586                        printk(" . %08x", trt->rt_dst);
 587                printk("\n");
 588        }
 589#endif
 590        rt_hash_table[hash] = rt;
 591        end_bh_atomic();
 592        *rp = rt;
 593        return 0;
 594}
 595
  /*
   * Apply a redirect: replace cached output routes to "daddr" that go
   * through "old_gw" with copies that go through "new_gw".
   *
   * old_gw  gateway the cached route currently uses
   * daddr   destination the redirect is about
   * new_gw  gateway to use from now on
   * saddr   source address of the affected route
   * tos     type of service; masked with IPTOS_TOS_MASK before use
   * dev     device the redirect refers to; its ip_ptr supplies the
   *         per-device settings consulted below
   *
   * The redirect is ignored (and optionally logged, see reject_redirect)
   * when new_gw equals old_gw, when the device does not accept redirects,
   * when new_gw is a multicast, bad-class or zero-net address, or when the
   * on-link / address-type checks for the device's media setting fail.
   *
   * Otherwise the cache is searched under all four key combinations of
   * source {saddr, 0} and output interface {dev->ifindex, 0}.
   */
  596void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
  597                    u32 saddr, u8 tos, struct device *dev)
  598{
  599        int i, k;
  600        struct in_device *in_dev = dev->ip_ptr;
  601        struct rtable *rth, **rthp;
  602        u32  skeys[2] = { saddr, 0 };
  603        int  ikeys[2] = { dev->ifindex, 0 };
  604
  605        tos &= IPTOS_TOS_MASK;
  606
  607        if (!in_dev)
  608                return;
  609
  610        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
  611            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
  612                goto reject_redirect;
  613
  614        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
  615                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
  616                        goto reject_redirect;
  617                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
  618                        goto reject_redirect;
  619        } else {
  620                if (inet_addr_type(new_gw) != RTN_UNICAST)
  621                        goto reject_redirect;
  622        }
  623
  624        for (i=0; i<2; i++) {
  625                for (k=0; k<2; k++) {
  626                        unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);
  627
  628                        rthp=&rt_hash_table[hash];
  629
  630                        while ( (rth = *rthp) != NULL) {
  631                                struct rtable *rt;
  632
                              /* Skip entries whose lookup key differs; only output routes (iif == 0) qualify. */
  633                                if (rth->key.dst != daddr ||
  634                                    rth->key.src != skeys[i] ||
  635                                    rth->key.tos != tos ||
  636                                    rth->key.oif != ikeys[k] ||
  637                                    rth->key.iif != 0) {
  638                                        rthp = &rth->u.rt_next;
  639                                        continue;
  640                                }
  641
                              /* Key matches but the route is not the one being redirected: give up on this bucket. */
  642                                if (rth->rt_dst != daddr ||
  643                                    rth->rt_src != saddr ||
  644                                    rth->u.dst.error ||
  645                                    rth->rt_gateway != old_gw ||
  646                                    rth->u.dst.dev != dev)
  647                                        break;
  648
                              /* Hold the old entry while its replacement is built. */
  649                                dst_clone(&rth->u.dst);
  650
  651                                rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
  652                                if (rt == NULL) {
  653                                        ip_rt_put(rth);
  654                                        return;
  655                                }
  656
  657                                /*
  658                                 * Copy all the information.
  659                                 */
  660                                *rt = *rth;
                              /* The copy starts with its own counters and no neighbour/hh binding. */
  661                                atomic_set(&rt->u.dst.refcnt, 1);
  662                                atomic_set(&rt->u.dst.use, 1);
  663                                rt->u.dst.lastuse = jiffies;
  664                                rt->u.dst.neighbour = NULL;
  665                                rt->u.dst.hh = NULL;
  666
  667                                rt->rt_flags |= RTCF_REDIRECTED;
  668
  669                                /* Gateway is different ... */
  670                                rt->rt_gateway = new_gw;
  671
  672                                /* Redirect received -> path was valid */
  673                                dst_confirm(&rth->u.dst);
  674
                              /* Use the new route only if its neighbour binds and is in a NUD_VALID state. */
  675                                if (!arp_bind_neighbour(&rt->u.dst) ||
  676                                    !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
  677                                        if (rt->u.dst.neighbour)
  678                                                neigh_event_send(rt->u.dst.neighbour, NULL);
  679                                        ip_rt_put(rth);
  680                                        rt_drop(rt);
  681                                        break;
  682                                }
  683
                              /* Unlink the old entry, insert the new one, then drop the old one. */
  684                                *rthp = rth->u.rt_next;
  685                                if (!rt_intern_hash(hash, rt, &rt))
  686                                        ip_rt_put(rt);
  687                                rt_drop(rth);
  688                                break;
  689                        }
  690                }
  691        }
  692        return;
  693
  694reject_redirect:
      /* Logging only; built when CONFIG_IP_ROUTE_VERBOSE is set and rate limited. */
  695#ifdef CONFIG_IP_ROUTE_VERBOSE
  696        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
  697                printk(KERN_INFO "Redirect from %lX/%s to %lX ignored."
  698                       "Path = %lX -> %lX, tos %02x\n",
  699                       ntohl(old_gw), dev->name, ntohl(new_gw),
  700                       ntohl(saddr), ntohl(daddr), tos);
  701#endif
  702}
 703
 704static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 705{
 706        struct rtable *rt = (struct rtable*)dst;
 707
 708        if (rt != NULL) {
 709                if (dst->obsolete) {
 710                        ip_rt_put(rt);
 711                        return NULL;
 712                }
 713                if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
 714                        unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
 715                        struct rtable **rthp;
 716#if RT_CACHE_DEBUG >= 1
 717                        printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
 718#endif
 719                        start_bh_atomic();
 720                        ip_rt_put(rt);
 721                        for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
 722                                if (*rthp == rt) {
 723                                        *rthp = rt->u.rt_next;
 724                                        rt_free(rt);
 725                                        break;
 726                                }
 727                        }
 728                        end_bh_atomic();
 729                        return NULL;
 730                }
 731        }
 732        return dst;
 733}
 734
 735/*
 736 * Algorithm:
 737 *      1. The first ip_rt_redirect_number redirects are sent
 738 *         with exponential backoff, then we stop sending them at all,
 739 *         assuming that the host ignores our redirects.
 740 *      2. If we did not see packets requiring redirects
 741 *         during ip_rt_redirect_silence, we assume that the host
 742 *         forgot redirected route and start to send redirects again.
 743 *
 744 * This algorithm is much cheaper and more intelligent than dumb load limiting
 745 * in icmp.c.
 746 *
 747 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 748 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 749 */
 750
 751void ip_rt_send_redirect(struct sk_buff *skb)
 752{
 753        struct rtable *rt = (struct rtable*)skb->dst;
 754        struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;
 755
 756        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
 757                return;
 758
 759        /* No redirected packets during ip_rt_redirect_silence;
 760         * reset the algorithm.
 761         */
 762        if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
 763                rt->u.dst.rate_tokens = 0;
 764
 765        /* Too many ignored redirects; do not send anything
 766         * set u.dst.rate_last to the last seen redirected packet.
 767         */
 768        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
 769                rt->u.dst.rate_last = jiffies;
 770                return;
 771        }
 772
 773        /* Check for load limit; set rate_last to the latest sent
 774         * redirect.
 775         */
 776        if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
 777                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 778                rt->u.dst.rate_last = jiffies;
 779                ++rt->u.dst.rate_tokens;
 780#ifdef CONFIG_IP_ROUTE_VERBOSE
 781                if (IN_DEV_LOG_MARTIANS(in_dev) &&
 782                    rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
 783                        printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
 784                               rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
 785#endif
 786        }
 787}
 788
 789static int ip_error(struct sk_buff *skb)
 790{
 791        struct rtable *rt = (struct rtable*)skb->dst;
 792        unsigned long now;
 793        int code;
 794
 795        switch (rt->u.dst.error) {
 796        case EINVAL:
 797        default:
 798                kfree_skb(skb);
 799                return 0;
 800        case EHOSTUNREACH:
 801                code = ICMP_HOST_UNREACH;
 802                break;
 803        case ENETUNREACH:
 804                code = ICMP_NET_UNREACH;
 805                break;
 806        case EACCES:
 807                code = ICMP_PKT_FILTERED;
 808                break;
 809        }
 810
 811        now = jiffies;
 812        if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
 813                rt->u.dst.rate_tokens = ip_rt_error_burst;
 814        rt->u.dst.rate_last = now;
 815        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
 816                rt->u.dst.rate_tokens -= ip_rt_error_cost;
 817                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 818        }
 819
 820        kfree_skb(skb);
 821        return 0;
 822} 
 823
 824/*
 825 *      The last two values are not from the RFC but
 826 *      are needed for AMPRnet AX.25 paths.
 827 */
 828
 829static unsigned short mtu_plateau[] =
 830{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
 831
 832static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
 833{
 834        int i;
 835        
 836        for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
 837                if (old_mtu > mtu_plateau[i])
 838                        return mtu_plateau[i];
 839        return 68;
 840}
 841
 842unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
 843{
 844        int i;
 845        unsigned short old_mtu = ntohs(iph->tot_len);
 846        struct rtable *rth;
 847        u32  skeys[2] = { iph->saddr, 0, };
 848        u32  daddr = iph->daddr;
 849        u8   tos = iph->tos & IPTOS_TOS_MASK;
 850        unsigned short est_mtu = 0;
 851
 852        if (ipv4_config.no_pmtu_disc)
 853                return 0;
 854
 855        for (i=0; i<2; i++) {
 856                unsigned hash = rt_hash_code(daddr, skeys[i], tos);
 857
 858                for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
 859                        if (rth->key.dst == daddr &&
 860                            rth->key.src == skeys[i] &&
 861                            rth->rt_dst == daddr &&
 862                            rth->rt_src == iph->saddr &&
 863                            rth->key.tos == tos &&
 864                            rth->key.iif == 0 &&
 865                            !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
 866                                unsigned short mtu = new_mtu;
 867
 868                                if (new_mtu < 68 || new_mtu >= old_mtu) {
 869
 870                                        /* BSD 4.2 compatibility hack :-( */
 871                                        if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
 872                                            old_mtu >= 68 + (iph->ihl<<2))
 873                                                old_mtu -= iph->ihl<<2;
 874
 875                                        mtu = guess_mtu(old_mtu);
 876                                }
 877                                if (mtu <= rth->u.dst.pmtu) {
 878                                        if (mtu < rth->u.dst.pmtu) { 
 879                                                dst_confirm(&rth->u.dst);
 880                                                rth->u.dst.pmtu = mtu;
 881                                                dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
 882                                        }
 883                                        est_mtu = mtu;
 884                                }
 885                        }
 886                }
 887        }
 888        return est_mtu;
 889}
 890
 891static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie)
 892{
 893        dst_release(dst);
 894        return NULL;
 895}
 896
 897static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
 898                                           struct sk_buff *skb)
 899{
 900        return NULL;
 901}
 902
 903static void ipv4_link_failure(struct sk_buff *skb)
 904{
 905        struct rtable *rt;
 906
 907        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 908
 909        rt = (struct rtable *) skb->dst;
 910        if (rt)
 911                dst_set_expires(&rt->u.dst, 0);
 912}
 913
 914static int ip_rt_bug(struct sk_buff *skb)
 915{
 916        printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
 917               skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
 918        kfree_skb(skb);
 919        return 0;
 920}
 921
 922/*
 923   We do not cache source address of outgoing interface,
 924   because it is used only by IP RR, TS and SRR options,
 925   so that it out of fast path.
 926
 927   BTW remember: "addr" is allowed to be not aligned
 928   in IP options!
 929 */
 930
 931void ip_rt_get_source(u8 *addr, struct rtable *rt)
 932{
 933        u32 src;
 934        struct fib_result res;
 935
 936        if (rt->key.iif == 0)
 937                src = rt->rt_src;
 938        else if (fib_lookup(&rt->key, &res) == 0)
 939                src = FIB_RES_PREFSRC(res);
 940        else
 941                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
 942        memcpy(addr, &src, 4);
 943}
 944
 945#ifdef CONFIG_NET_CLS_ROUTE
 946static void set_class_tag(struct rtable *rt, u32 tag)
 947{
 948        if (!(rt->u.dst.tclassid&0xFFFF))
 949                rt->u.dst.tclassid |= tag&0xFFFF;
 950        if (!(rt->u.dst.tclassid&0xFFFF0000))
 951                rt->u.dst.tclassid |= tag&0xFFFF0000;
 952}
 953#endif
 954
 955static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 956{
 957        struct fib_info *fi = res->fi;
 958
 959        if (fi) {
 960                if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
 961                        rt->rt_gateway = FIB_RES_GW(*res);
 962                rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
 963                rt->u.dst.pmtu = fi->fib_mtu;
 964                if (fi->fib_mtu == 0) {
 965                        rt->u.dst.pmtu = rt->u.dst.dev->mtu;
 966                        if (rt->u.dst.pmtu > IP_MAX_MTU)
 967                                rt->u.dst.pmtu = IP_MAX_MTU;
 968                        if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
 969                            rt->rt_gateway != rt->rt_dst &&
 970                            rt->u.dst.pmtu > 576)
 971                                rt->u.dst.pmtu = 576;
 972                }
 973                rt->u.dst.window= fi->fib_window ? : 0;
 974                rt->u.dst.rtt   = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
 975#ifdef CONFIG_NET_CLS_ROUTE
 976                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
 977#endif
 978        } else {
 979                rt->u.dst.pmtu  = rt->u.dst.dev->mtu;
 980                if (rt->u.dst.pmtu > IP_MAX_MTU)
 981                        rt->u.dst.pmtu = IP_MAX_MTU;
 982                rt->u.dst.window= 0;
 983                rt->u.dst.rtt   = TCP_TIMEOUT_INIT;
 984        }
 985#ifdef CONFIG_NET_CLS_ROUTE
 986#ifdef CONFIG_IP_MULTIPLE_TABLES
 987        set_class_tag(rt, fib_rules_tclass(res));
 988#endif
 989        set_class_tag(rt, itag);
 990#endif
 991        rt->rt_type = res->type;
 992}
 993
 994static int
 995ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
 996                  u8 tos, struct device *dev, int our)
 997{
 998        unsigned hash;
 999        struct rtable *rth;
1000        u32 spec_dst;
1001        struct in_device *in_dev = dev->ip_ptr;
1002        u32 itag = 0;
1003
1004        /* Primary sanity checks. */
1005
1006        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1007            in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP))
1008                return -EINVAL;
1009
1010        if (ZERONET(saddr)) {
1011                if (!LOCAL_MCAST(daddr))
1012                        return -EINVAL;
1013                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1014        } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
1015                return -EINVAL;
1016
1017        rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
1018        if (!rth)
1019                return -ENOBUFS;
1020
1021        rth->u.dst.output= ip_rt_bug;
1022
1023        atomic_set(&rth->u.dst.use, 1);
1024        rth->key.dst    = daddr;
1025        rth->rt_dst     = daddr;
1026        rth->key.tos    = tos;
1027#ifdef CONFIG_IP_ROUTE_FWMARK
1028        rth->key.fwmark = skb->fwmark;
1029#endif
1030        rth->key.src    = saddr;
1031        rth->rt_src     = saddr;
1032#ifdef CONFIG_IP_ROUTE_NAT
1033        rth->rt_dst_map = daddr;
1034        rth->rt_src_map = saddr;
1035#endif
1036#ifdef CONFIG_NET_CLS_ROUTE
1037        rth->u.dst.tclassid = itag;
1038#endif
1039        rth->rt_iif     =
1040        rth->key.iif    = dev->ifindex;
1041        rth->u.dst.dev  = &loopback_dev;
1042        rth->key.oif    = 0;
1043        rth->rt_gateway = daddr;
1044        rth->rt_spec_dst= spec_dst;
1045        rth->rt_type    = RTN_MULTICAST;
1046        rth->rt_flags   = RTCF_MULTICAST;
1047        if (our) {
1048                rth->u.dst.input= ip_local_deliver;
1049                rth->rt_flags |= RTCF_LOCAL;
1050        }
1051
1052#ifdef CONFIG_IP_MROUTE
1053        if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1054                rth->u.dst.input = ip_mr_input;
1055#endif
1056
1057        hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
1058        return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1059}
1060
/*
 *      Slow path of input routing: classify a received packet, build the
 *      matching route cache entry and install it with rt_intern_hash()
 *      (which also attaches it to skb->dst).
 *
 *      daddr/saddr/tos come from the packet, "dev" is the receiving
 *      device.  Returns -EINVAL for martians, disabled IP/forwarding and
 *      routes that are not usable, -ENOBUFS when no cache entry can be
 *      allocated, otherwise whatever rt_intern_hash() returns.
 *
 *      NOTE. We drop all packets that have local source
 *      addresses, because every properly looped back packet
 *      must have the correct destination already attached by the output
 *      routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 */

int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
                        u8 tos, struct device *dev)
{
        struct rt_key   key;
        struct fib_result res;
        struct in_device *in_dev = dev->ip_ptr;
        struct in_device *out_dev;
        unsigned        flags = 0;
        u32             itag = 0;
        struct rtable * rth;
        unsigned        hash;
        u32             spec_dst;
        int             err = -EINVAL;

        /*
         *      IP on this device is disabled.
         */

        if (!in_dev)
                return -EINVAL;

        /* Key used for the FIB lookup below. */
        key.dst = daddr;
        key.src = saddr;
        key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        key.fwmark = skb->fwmark;
#endif
        key.iif = dev->ifindex;
        key.oif = 0;
        key.scope = RT_SCOPE_UNIVERSE;

        /* Same hash formula as the cache lookup in ip_route_input(). */
        hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);

        /* Check for the weirdest martians, which cannot be detected
           by fib_lookup.
         */

        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
                goto martian_source;

        /* Limited broadcast, or all-zero source and destination:
         * handled as broadcast input without consulting the FIB.
         */
        if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only to limited broadcast;
         * I do not even know whether to fix it or not. Waiting for complaints :-)
         */
        if (ZERONET(saddr))
                goto martian_source;

        if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
                goto martian_destination;

        /*
         *      Now we are ready to route packet.
         */
        if ((err = fib_lookup(&key, &res))) {
                if (!IN_DEV_FORWARD(in_dev))
                        return -EINVAL;
                /* err is kept: no_route/local_input store -err in the entry. */
                goto no_route;
        }

#ifdef CONFIG_IP_ROUTE_NAT
        /* Policy is applied before mapping destination,
           but rerouting after map should be made with old source.
         */

        if (1) {
                u32 src_map = saddr;
                if (res.r)
                        src_map = fib_rules_policy(saddr, &res, &flags);

                if (res.type == RTN_NAT) {
                        /* Look the mapped destination up again; only a
                         * unicast result is acceptable.
                         */
                        key.dst = fib_rules_map_destination(daddr, &res);
                        if (fib_lookup(&key, &res) || res.type != RTN_UNICAST)
                                return -EINVAL;
                        flags |= RTCF_DNAT;
                }
                key.src = src_map;
        }
#endif

        if (res.type == RTN_BROADCAST)
                goto brd_input;

        if (res.type == RTN_LOCAL) {
                int result;
                result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
                                             dev, &spec_dst, &itag);
                if (result < 0)
                        goto martian_source;
                if (result)
                        flags |= RTCF_DIRECTSRC;
                /* For local delivery the specific destination is daddr itself. */
                spec_dst = daddr;
                goto local_input;
        }

        /* From here on the packet would have to be forwarded. */
        if (!IN_DEV_FORWARD(in_dev))
                return -EINVAL;
        if (res.type != RTN_UNICAST)
                goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && key.oif == 0)
                fib_select_multipath(&key, &res);
#endif
        out_dev = FIB_RES_DEV(res)->ip_ptr;
        if (out_dev == NULL) {
                if (net_ratelimit())
                        printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
                return -EINVAL;
        }

        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
        if (err < 0)
                goto martian_source;

        if (err)
                flags |= RTCF_DIRECTSRC;

        /* Packet would leave through the interface it arrived on and the
         * source validated as direct: mark the route so that a redirect
         * can be sent (not for NAT/masqueraded routes).
         */
        if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
            (IN_DEV_SHARED_MEDIA(out_dev)
             || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
                flags |= RTCF_DOREDIRECT;

        if (skb->protocol != __constant_htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create route, if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 */
                if (out_dev == in_dev && !(flags&RTCF_DNAT))
                        return -EINVAL;
        }

        /* Forwarding entry. */
        rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
        if (!rth)
                return -ENOBUFS;

        atomic_set(&rth->u.dst.use, 1);
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->key.fwmark = skb->fwmark;
#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
        rth->rt_gateway = daddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_src_map = key.src;
        rth->rt_dst_map = key.dst;
        if (flags&RTCF_DNAT)
                rth->rt_gateway = key.dst;
#endif
        rth->rt_iif     =
        rth->key.iif    = dev->ifindex;
        rth->u.dst.dev  = out_dev->dev;
        rth->key.oif    = 0;
        rth->rt_spec_dst= spec_dst;

        rth->u.dst.input = ip_forward;
        rth->u.dst.output = ip_output;

        /* May replace rt_gateway and fills pmtu/window/rtt/type from res. */
        rt_set_nexthop(rth, &res, itag);

        rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
        if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
                struct device *odev = rth->u.dst.dev;
                if (odev != dev &&
                    dev->accept_fastpath &&
                    odev->mtu >= dev->mtu &&
                    dev->accept_fastpath(dev, &rth->u.dst) == 0)
                        rth->rt_flags |= RTCF_FAST;
        }
#endif

        return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);

brd_input:
        /* Broadcast input is accepted for IP packets only. */
        if (skb->protocol != __constant_htons(ETH_P_IP))
                return -EINVAL;

        if (ZERONET(saddr)) {
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        } else {
                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
                if (err < 0)
                        goto martian_source;
                if (err)
                        flags |= RTCF_DIRECTSRC;
        }
        flags |= RTCF_BROADCAST;
        /* res may not have been filled by fib_lookup on this path; only
         * res.type is used below.
         */
        res.type = RTN_BROADCAST;

local_input:
        /* Entry for local delivery; also reached for broadcast and,
         * via no_route, for unreachable destinations.
         */
        rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
        if (!rth)
                return -ENOBUFS;

        rth->u.dst.output= ip_rt_bug;

        atomic_set(&rth->u.dst.use, 1);
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->key.fwmark = skb->fwmark;
#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = key.dst;
        rth->rt_src_map = key.src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif     =
        rth->key.iif    = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
        rth->key.oif    = 0;
        rth->rt_gateway = daddr;
        rth->rt_spec_dst= spec_dst;
        rth->u.dst.input= ip_local_deliver;
        rth->rt_flags   = flags|RTCF_LOCAL;
        if (res.type == RTN_UNREACHABLE) {
                /* Cache the failure: ip_error() will answer with ICMP,
                 * using the negated fib_lookup() error as dst.error.
                 */
                rth->u.dst.input= ip_error;
                rth->u.dst.error= -err;
                rth->rt_flags   &= ~RTCF_LOCAL;
        }
        rth->rt_type    = res.type;
        return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);

no_route:
        /* fib_lookup() failed while forwarding is enabled on in_dev. */
        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
        res.type = RTN_UNREACHABLE;
        goto local_input;

        /*
         *      Do not cache martian addresses: they should be logged (RFC1812)
         */
martian_destination:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
#endif
        return -EINVAL;

martian_source:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC1812 recommendation: if the source is martian,
                 *      the only hint is the MAC header.
                 */
                printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name);
                if (dev->hard_header_len) {
                        int i;
                        unsigned char *p = skb->mac.raw;
                        /* Dump hard_header_len bytes starting at skb->mac.raw. */
                        printk(KERN_WARNING "ll header:");
                        for (i=0; i<dev->hard_header_len; i++, p++)
                                printk(" %02x", *p);
                        printk("\n");
                }
        }
#endif
        return -EINVAL;
}
1339
1340int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1341                   u8 tos, struct device *dev)
1342{
1343        struct rtable * rth;
1344        unsigned        hash;
1345        int iif = dev->ifindex;
1346
1347        tos &= IPTOS_TOS_MASK;
1348        hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
1349
1350        for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
1351                if (rth->key.dst == daddr &&
1352                    rth->key.src == saddr &&
1353                    rth->key.iif == iif &&
1354                    rth->key.oif == 0 &&
1355#ifdef CONFIG_IP_ROUTE_FWMARK
1356                    rth->key.fwmark == skb->fwmark &&
1357#endif
1358                    rth->key.tos == tos) {
1359                        rth->u.dst.lastuse = jiffies;
1360                        atomic_inc(&rth->u.dst.use);
1361                        atomic_inc(&rth->u.dst.refcnt);
1362                        skb->dst = (struct dst_entry*)rth;
1363                        return 0;
1364                }
1365        }
1366
1367        /* Multicast recognition logic is moved from route cache to here.
1368           The problem was that too many Ethernet cards have broken/missing
1369           hardware multicast filters :-( As result the host on multicasting
1370           network acquires a lot of useless route cache entries, sort of
1371           SDR messages from all the world. Now we try to get rid of them.
1372           Really, provided software IP multicast filter is organized
1373           reasonably (at least, hashed), it does not result in a slowdown
1374           comparing with route cache reject entries.
1375           Note, that multicast routers are not affected, because
1376           route cache entry is created eventually.
1377         */
1378        if (MULTICAST(daddr)) {
1379                int our = ip_check_mc(dev, daddr);
1380                if (!our
1381#ifdef CONFIG_IP_MROUTE
1382                    && (LOCAL_MCAST(daddr) || !dev->ip_ptr ||
1383                        !IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr))
1384#endif
1385                    ) return -EINVAL;
1386                return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
1387        }
1388        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1389}
1390
/*
 * Major route resolver routine.
 *
 * Slow path of output routing: resolve a route for daddr/saddr/tos/oif,
 * build a cache entry and install it with rt_intern_hash(), which
 * hands the entry back through *rp.
 *
 * saddr and/or daddr may be 0 and oif may be 0; the missing pieces are
 * chosen here.  "tos" may carry the RTO_ONLINK flag (and RTO_TPROXY
 * with CONFIG_IP_TRANSPARENT_PROXY) besides the IP TOS bits.
 *
 * Returns -EINVAL, -ENODEV, -ENETUNREACH or -ENOBUFS on the failures
 * below, otherwise whatever rt_intern_hash() returns.
 */

int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
{
        struct rt_key key;
        struct fib_result res;
        unsigned flags = 0;
        struct rtable *rth;
        struct device *dev_out = NULL;
        unsigned hash;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
        /* Remember the flag now: it is masked out of tos just below. */
        u32 nochecksrc = (tos & RTO_TPROXY);
#endif

        tos &= IPTOS_TOS_MASK|RTO_ONLINK;
        key.dst = daddr;
        key.src = saddr;
        key.tos = tos&IPTOS_TOS_MASK;
        key.iif = loopback_dev.ifindex;
        key.oif = oif;
        key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
        /* Several paths jump to make_route without a FIB lookup, so the
         * fields read there unconditionally are preset here.
         */
        res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
        res.r = NULL;
#endif

        if (saddr) {
                if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
                        return -EINVAL;

                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                dev_out = ip_dev_find(saddr);
#ifdef CONFIG_IP_TRANSPARENT_PROXY
                /* If address is not local, test for transparent proxy flag;
                   if address is local --- clear the flag.
                 */
                if (dev_out == NULL) {
                        if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST)
                                return -EINVAL;
                        flags |= RTCF_TPROXY;
                }
#else
                if (dev_out == NULL)
                        return -EINVAL;
#endif

                /* I removed the check for oif == dev_out->oif here.
                   It was wrong for the following reasons:
                   1. ip_dev_find(saddr) can return the wrong iface, if saddr
                      is assigned to multiple interfaces.
                   2. Moreover, we are allowed to send packets with the saddr
                      of another iface. --ANK
                 */

                if (oif == 0 &&
#ifdef CONFIG_IP_TRANSPARENT_PROXY
                        dev_out &&
#endif
                        (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
                        /* Special hack: user can direct multicasts
                           and limited broadcast via necessary interface
                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
                           This hack is not just for fun, it allows
                           vic,vat and friends to work.
                           They bind socket to loopback, set ttl to zero
                           and expect that it will work.
                           From the viewpoint of routing cache they are broken,
                           because we are not allowed to build multicast path
                           with loopback source addr (look, routing cache
                           cannot know, that ttl is zero, so that packet
                           will not leave this host and route is valid).
                           Luckily, this hack is good workaround.
                         */

                        key.oif = dev_out->ifindex;
                        goto make_route;
                }
                /* The device owning saddr does not constrain the route. */
                dev_out = NULL;
        }
        if (oif) {
                dev_out = dev_get_by_index(oif);
                if (dev_out == NULL)
                        return -ENODEV;
                if (dev_out->ip_ptr == NULL)
                        return -ENODEV; /* Wrong error code */

                if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
                        /* NOTE(review): key.src is overwritten here even if
                         * the caller supplied saddr -- confirm this is intended.
                         */
                        key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
                        goto make_route;
                }
                if (MULTICAST(daddr))
                        key.src = inet_select_addr(dev_out, 0, key.scope);
                else if (!daddr)
                        key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
        }

        if (!key.dst) {
                /* No destination: route to ourselves via loopback, using
                 * the source address (or INADDR_LOOPBACK) as destination.
                 */
                key.dst = key.src;
                if (!key.dst)
                        key.dst = key.src = htonl(INADDR_LOOPBACK);
                dev_out = &loopback_dev;
                key.oif = loopback_dev.ifindex;
                res.type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        if (fib_lookup(&key, &res)) {
                res.fi = NULL;
                if (oif) {
                        /* Apparently, routing tables are wrong. Assume,
                           that the destination is on link.

                           WHY? DW.
                           Because we are allowed to send to iface
                           even if it has NO routes and NO assigned
                           addresses. When oif is specified, routing
                           tables are looked up with only one purpose:
                           to catch if destination is gatewayed, rather than
                           direct. Moreover, if MSG_DONTROUTE is set,
                           we send packet, ignoring both routing tables
                           and ifaddr state. --ANK


                           We could make it even if oif is unknown,
                           likely IPv6, but we do not.
                         */

                        if (key.src == 0)
                                key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
                        res.type = RTN_UNICAST;
                        goto make_route;
                }
                return -ENETUNREACH;
        }

        if (res.type == RTN_NAT)
                return -EINVAL;

        if (res.type == RTN_LOCAL) {
                /* Destination is one of our own addresses: use loopback. */
                if (!key.src)
                        key.src = key.dst;
                dev_out = &loopback_dev;
                key.oif = dev_out->ifindex;
                res.fi = NULL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && key.oif == 0)
                fib_select_multipath(&key, &res);
        else
#endif
        if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
                fib_select_default(&key, &res);

        if (!key.src)
                key.src = FIB_RES_PREFSRC(res);

        dev_out = FIB_RES_DEV(res);
        key.oif = dev_out->ifindex;

make_route:
        /* A loopback source address may only be used on a loopback device. */
        if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
                return -EINVAL;

        /* The destination address overrides the type for broadcast and
         * multicast; on the paths that skipped fib_lookup() this is what
         * initialises res.type.
         */
        if (key.dst == 0xFFFFFFFF)
                res.type = RTN_BROADCAST;
        else if (MULTICAST(key.dst))
                res.type = RTN_MULTICAST;
        else if (BADCLASS(key.dst) || ZERONET(key.dst))
                return -EINVAL;

        if (dev_out->flags&IFF_LOOPBACK)
                flags |= RTCF_LOCAL;

        if (res.type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST|RTCF_LOCAL;
                res.fi = NULL;
        } else if (res.type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST|RTCF_LOCAL;
                /* Keep RTCF_LOCAL only if ip_check_mc() accepts the group
                 * on the output device.
                 */
                if (!ip_check_mc(dev_out, daddr))
                        flags &= ~RTCF_LOCAL;
                /* If a multicast route does not exist use the
                   default one, but do not gateway in this case.
                   Yes, it is a hack.
                 */
                if (res.fi && res.prefixlen < 4)
                        res.fi = NULL;
        }

        rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
        if (!rth)
                return -ENOBUFS;

        /* The cache key records the caller's original arguments (with tos
         * masked as above), while rt_dst/rt_src record what was resolved.
         */
        atomic_set(&rth->u.dst.use, 1);
        rth->key.dst    = daddr;
        rth->key.tos    = tos;
        rth->key.src    = saddr;
        rth->key.iif    = 0;
        rth->key.oif    = oif;
        rth->rt_dst     = key.dst;
        rth->rt_src     = key.src;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = key.dst;
        rth->rt_src_map = key.src;
#endif
        rth->rt_iif     = oif ? : dev_out->ifindex;
        rth->u.dst.dev  = dev_out;
        rth->rt_gateway = key.dst;
        rth->rt_spec_dst= key.src;

        rth->u.dst.output=ip_output;

        if (flags&RTCF_LOCAL) {
                rth->u.dst.input = ip_local_deliver;
                rth->rt_spec_dst = key.dst;
        }
        if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
                rth->rt_spec_dst = key.src;
                if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
                        rth->u.dst.output = ip_mc_output;
#ifdef CONFIG_IP_MROUTE
                if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
                        struct in_device *in_dev = dev_out->ip_ptr;
                        if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
                                rth->u.dst.input = ip_mr_input;
                                rth->u.dst.output = ip_mc_output;
                        }
                }
#endif
        }

        /* May replace rt_gateway and fills pmtu/window/rtt/type from res. */
        rt_set_nexthop(rth, &res, 0);

        rth->rt_flags = flags;

        /* Same hash formula as the cache lookup in ip_route_output(). */
        hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
        return rt_intern_hash(hash, rth, rp);
}
1634
1635int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
1636{
1637        unsigned hash;
1638        struct rtable *rth;
1639
1640        hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
1641
1642        start_bh_atomic();
1643        for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
1644                if (rth->key.dst == daddr &&
1645                    rth->key.src == saddr &&
1646                    rth->key.iif == 0 &&
1647                    rth->key.oif == oif &&
1648#ifndef CONFIG_IP_TRANSPARENT_PROXY
1649                    rth->key.tos == tos
1650#else
1651                    !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
1652                    ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
1653#endif
1654                ) {
1655                        rth->u.dst.lastuse = jiffies;
1656                        atomic_inc(&rth->u.dst.use);
1657                        atomic_inc(&rth->u.dst.refcnt);
1658                        end_bh_atomic();
1659                        *rp = rth;
1660                        return 0;
1661                }
1662        }
1663        end_bh_atomic();
1664
1665        return ip_route_output_slow(rp, daddr, saddr, tos, oif);
1666}
1667
1668#ifdef CONFIG_RTNETLINK
1669
1670static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
1671{
1672        struct rtable *rt = (struct rtable*)skb->dst;
1673        struct rtmsg *r;
1674        struct nlmsghdr  *nlh;
1675        unsigned char    *b = skb->tail;
1676        struct rta_cacheinfo ci;
1677#ifdef CONFIG_IP_MROUTE
1678        struct rtattr *eptr;
1679#endif
1680        struct rtattr *mx;
1681
1682        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
1683        r = NLMSG_DATA(nlh);
1684        nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1685        r->rtm_family = AF_INET;
1686        r->rtm_dst_len = 32;
1687        r->rtm_src_len = 0;
1688        r->rtm_tos = rt->key.tos;
1689        r->rtm_table = RT_TABLE_MAIN;
1690        r->rtm_type = rt->rt_type;
1691        r->rtm_scope = RT_SCOPE_UNIVERSE;
1692        r->rtm_protocol = RTPROT_UNSPEC;
1693        r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
1694        if (rt->rt_flags & RTCF_NOTIFY)
1695                r->rtm_flags |= RTM_F_NOTIFY;
1696        RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
1697        if (rt->key.src) {
1698                r->rtm_src_len = 32;
1699                RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
1700        }
1701        if (rt->u.dst.dev)
1702                RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
1703#ifdef CONFIG_NET_CLS_ROUTE
1704        if (rt->u.dst.tclassid)
1705                RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
1706#endif
1707        if (rt->key.iif)
1708                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
1709        else if (rt->rt_src != rt->key.src)
1710                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
1711        if (rt->rt_dst != rt->rt_gateway)
1712                RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
1713        mx = (struct rtattr*)skb->tail;
1714        RTA_PUT(skb, RTA_METRICS, 0, NULL);
1715        if (rt->u.dst.mxlock)
1716                RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
1717        if (rt->u.dst.pmtu)
1718                RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
1719        if (rt->u.dst.window)
1720                RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
1721        if (rt->u.dst.rtt)
1722                RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
1723        mx->rta_len = skb->tail - (u8*)mx;
1724        if (mx->rta_len == RTA_LENGTH(0))
1725                skb_trim(skb, (u8*)mx - skb->data);
1726        ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
1727        ci.rta_used = atomic_read(&rt->u.dst.refcnt);
1728        ci.rta_clntref = atomic_read(&rt->u.dst.use);
1729        if (rt->u.dst.expires)
1730                ci.rta_expires = rt->u.dst.expires - jiffies;
1731        else
1732                ci.rta_expires = 0;
1733        ci.rta_error = rt->u.dst.error;
1734#ifdef CONFIG_IP_MROUTE
1735        eptr = (struct rtattr*)skb->tail;
1736#endif
1737        RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1738        if (rt->key.iif) {
1739#ifdef CONFIG_IP_MROUTE
1740                u32 dst = rt->rt_dst;
1741
1742                if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
1743                        int err = ipmr_get_route(skb, r, nowait);
1744                        if (err <= 0) {
1745                                if (!nowait) {
1746                                        if (err == 0)
1747                                                return 0;
1748                                        goto nlmsg_failure;
1749                                } else {
1750                                        if (err == -EMSGSIZE)
1751                                                goto nlmsg_failure;
1752                                        ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
1753                                }
1754                        }
1755                } else
1756#endif
1757                {
1758                        RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
1759                }
1760        }
1761
1762        nlh->nlmsg_len = skb->tail - b;
1763        return skb->len;
1764
1765nlmsg_failure:
1766rtattr_failure:
1767        skb_trim(skb, b - skb->data);
1768        return -1;
1769}
1770
1771int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1772{
1773        struct rtattr **rta = arg;
1774        struct rtmsg *rtm = NLMSG_DATA(nlh);
1775        struct rtable *rt = NULL;
1776        u32 dst = 0;
1777        u32 src = 0;
1778        int iif = 0;
1779        int err;
1780        struct sk_buff *skb;
1781
1782        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1783        if (skb == NULL)
1784                return -ENOBUFS;
1785
1786        /* Reserve room for dummy headers, this skb can pass
1787           through good chunk of routing engine.
1788         */
1789        skb->mac.raw = skb->data;
1790        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
1791
1792        if (rta[RTA_SRC-1])
1793                memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
1794        if (rta[RTA_DST-1])
1795                memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
1796        if (rta[RTA_IIF-1])
1797                memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1798
1799        if (iif) {
1800                struct device *dev;
1801                dev = dev_get_by_index(iif);
1802                if (!dev)
1803                        return -ENODEV;
1804                skb->protocol = __constant_htons(ETH_P_IP);
1805                skb->dev = dev;
1806                start_bh_atomic();
1807                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
1808                end_bh_atomic();
1809                rt = (struct rtable*)skb->dst;
1810                if (!err && rt->u.dst.error)
1811                        err = -rt->u.dst.error;
1812        } else {
1813                int oif = 0;
1814                if (rta[RTA_OIF-1])
1815                        memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1816                err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
1817        }
1818        if (err) {
1819                kfree_skb(skb);
1820                return err;
1821        }
1822
1823        skb->dst = &rt->u.dst;
1824        if (rtm->rtm_flags & RTM_F_NOTIFY)
1825                rt->rt_flags |= RTCF_NOTIFY;
1826
1827        NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1828
1829        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
1830        if (err == 0)
1831                return 0;
1832        if (err < 0)
1833                return -EMSGSIZE;
1834
1835        err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1836        if (err < 0)
1837                return err;
1838        return 0;
1839}
1840
1841
1842int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
1843{
1844        struct rtable *rt;
1845        int h, s_h;
1846        int idx, s_idx;
1847
1848        s_h = cb->args[0];
1849        s_idx = idx = cb->args[1];
1850        for (h=0; h < RT_HASH_DIVISOR; h++) {
1851                if (h < s_h) continue;
1852                if (h > s_h)
1853                        memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
1854                start_bh_atomic();
1855                for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
1856                        if (idx < s_idx)
1857                                continue;
1858                        skb->dst = dst_clone(&rt->u.dst);
1859                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
1860                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
1861                                dst_release(xchg(&skb->dst, NULL));
1862                                end_bh_atomic();
1863                                goto done;
1864                        }
1865                        dst_release(xchg(&skb->dst, NULL));
1866                }
1867                end_bh_atomic();
1868        }
1869
1870done:
1871        cb->args[0] = h;
1872        cb->args[1] = idx;
1873        return skb->len;
1874}
1875
1876#endif /* CONFIG_RTNETLINK */
1877
/*
 * Multicast event hook: flushes the route cache by calling
 * rt_cache_flush() with a delay argument of 0.  The in_dev argument
 * is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(0);
}
1882
1883
1884
1885#ifdef CONFIG_SYSCTL
1886
1887static int flush_delay;
1888
1889static
1890int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
1891                              void *buffer, size_t *lenp)
1892{
1893        if (write) {
1894                proc_dointvec(ctl, write, filp, buffer, lenp);
1895                rt_cache_flush(flush_delay);
1896                return 0;
1897        } else
1898                return -EINVAL;
1899}
1900
1901ctl_table ipv4_route_table[] = {
1902        {NET_IPV4_ROUTE_FLUSH, "flush",
1903         &flush_delay, sizeof(int), 0200, NULL,
1904         &ipv4_sysctl_rtcache_flush},
1905        {NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
1906         &ip_rt_min_delay, sizeof(int), 0644, NULL,
1907         &proc_dointvec_jiffies},
1908        {NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
1909         &ip_rt_max_delay, sizeof(int), 0644, NULL,
1910         &proc_dointvec_jiffies},
1911        {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
1912         &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
1913         &proc_dointvec},
1914        {NET_IPV4_ROUTE_MAX_SIZE, "max_size",
1915         &ip_rt_max_size, sizeof(int), 0644, NULL,
1916         &proc_dointvec},
1917        {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
1918         &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
1919         &proc_dointvec_jiffies},
1920        {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
1921         &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
1922         &proc_dointvec_jiffies},
1923        {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
1924         &ip_rt_gc_interval, sizeof(int), 0644, NULL,
1925         &proc_dointvec_jiffies},
1926        {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
1927         &ip_rt_redirect_load, sizeof(int), 0644, NULL,
1928         &proc_dointvec},
1929        {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
1930         &ip_rt_redirect_number, sizeof(int), 0644, NULL,
1931         &proc_dointvec},
1932        {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
1933         &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
1934         &proc_dointvec},
1935        {NET_IPV4_ROUTE_ERROR_COST, "error_cost",
1936         &ip_rt_error_cost, sizeof(int), 0644, NULL,
1937         &proc_dointvec},
1938        {NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
1939         &ip_rt_error_burst, sizeof(int), 0644, NULL,
1940         &proc_dointvec},
1941        {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
1942         &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
1943         &proc_dointvec},
1944        {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
1945         &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
1946         &proc_dointvec_jiffies},
1947         {0}
1948};
1949#endif
1950
#ifdef CONFIG_NET_CLS_ROUTE
/* Accounting table exported raw through the "net/rt_acct" proc entry. */
struct ip_rt_acct ip_rt_acct[256];

#ifdef CONFIG_PROC_FS
/*
 * read_proc handler that copies a window of the raw ip_rt_acct table
 * into buffer.
 *
 *   buffer - destination of the copy
 *   start  - set to buffer
 *   offset - byte offset into the table
 *   length - number of bytes requested
 *   eof    - set to 1 once the request reaches or passes the table end
 *   data   - unused
 *
 * Returns the number of bytes copied, 0 when nothing is left to copy,
 * or -EINVAL for a negative offset.
 *
 * All range checks are done in signed arithmetic: mixing the signed
 * offset/length with the unsigned sizeof() made a negative offset or
 * length turn into an out-of-bounds copy.  A read that ends exactly at
 * the end of the table now reports EOF as well.
 */
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
                           int length, int *eof, void *data)
{
        const off_t total = sizeof(ip_rt_acct);

        *start=buffer;

        if (offset < 0)
                return -EINVAL;

        if (offset >= total) {
                *eof = 1;
                return 0;
        }

        /* Clamp the request to what is left of the table. */
        if (length >= total - offset) {
                length = total - offset;
                *eof = 1;
        }
        if (length > 0) {
                /* Copy with bottom halves disabled. */
                start_bh_atomic();
                memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
                end_bh_atomic();
                return length;
        }
        return 0;
}
#endif
#endif
1974
1975
1976__initfunc(void ip_rt_init(void))
1977{
1978#ifdef CONFIG_PROC_FS
1979#ifdef CONFIG_NET_CLS_ROUTE
1980        struct proc_dir_entry *ent;
1981#endif
1982#endif
1983        devinet_init();
1984        ip_fib_init();
1985        rt_periodic_timer.function = rt_check_expire;
1986        /* All the timers, started at system startup tend
1987           to synchronize. Perturb it a bit.
1988         */
1989        rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
1990                + ip_rt_gc_interval;
1991        add_timer(&rt_periodic_timer);
1992
1993#ifdef CONFIG_PROC_FS
1994        proc_net_register(&(struct proc_dir_entry) {
1995                PROC_NET_RTCACHE, 8, "rt_cache",
1996                S_IFREG | S_IRUGO, 1, 0, 0,
1997                0, &proc_net_inode_operations,
1998                rt_cache_get_info
1999        });
2000#ifdef CONFIG_NET_CLS_ROUTE
2001        ent = create_proc_entry("net/rt_acct", 0, 0);
2002        ent->read_proc = ip_rt_acct_read;
2003#endif
2004#endif
2005}
2006
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.