linux/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15 *
  16 * Fixes:
  17 *              Alan Cox        :       Verify area fixes.
  18 *              Alan Cox        :       cli() protects routing changes
  19 *              Rui Oliveira    :       ICMP routing table updates
  20 *              (rco@di.uminho.pt)      Routing table insertion and update
  21 *              Linus Torvalds  :       Rewrote bits to be sensible
  22 *              Alan Cox        :       Added BSD route gw semantics
  23 *              Alan Cox        :       Super /proc >4K
  24 *              Alan Cox        :       MTU in route table
  25 *              Alan Cox        :       MSS actually. Also added the window
  26 *                                      clamper.
  27 *              Sam Lantinga    :       Fixed route matching in rt_del()
  28 *              Alan Cox        :       Routing cache support.
  29 *              Alan Cox        :       Removed compatibility cruft.
  30 *              Alan Cox        :       RTF_REJECT support.
  31 *              Alan Cox        :       TCP irtt support.
  32 *              Jonathan Naylor :       Added Metric support.
  33 *      Miquel van Smoorenburg  :       BSD API fixes.
  34 *      Miquel van Smoorenburg  :       Metrics.
  35 *              Alan Cox        :       Use __u32 properly
  36 *              Alan Cox        :       Aligned routing errors more closely with BSD
  37 *                                      our system is still very different.
  38 *              Alan Cox        :       Faster /proc handling
  39 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40 *                                      routing caches and better behaviour.
  41 *
  42 *              Olaf Erb        :       irtt wasn't being copied right.
  43 *              Bjorn Ekwall    :       Kerneld route support.
  44 *              Alan Cox        :       Multicast fixed (I hope)
  45 *              Pavel Krauz     :       Limited broadcast fixed
  46 *              Mike McLagan    :       Routing by source
  47 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48 *                                      route.c and rewritten from scratch.
  49 *              Andi Kleen      :       Load-limit warning messages.
  50 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54 *              Marc Boucher    :       routing by fwmark
  55 *      Robert Olsson           :       Added rt_cache statistics
  56 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60 *
  61 *              This program is free software; you can redistribute it and/or
  62 *              modify it under the terms of the GNU General Public License
  63 *              as published by the Free Software Foundation; either version
  64 *              2 of the License, or (at your option) any later version.
  65 */
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <asm/system.h>
  70#include <linux/bitops.h>
  71#include <linux/types.h>
  72#include <linux/kernel.h>
  73#include <linux/mm.h>
  74#include <linux/bootmem.h>
  75#include <linux/string.h>
  76#include <linux/socket.h>
  77#include <linux/sockios.h>
  78#include <linux/errno.h>
  79#include <linux/in.h>
  80#include <linux/inet.h>
  81#include <linux/netdevice.h>
  82#include <linux/proc_fs.h>
  83#include <linux/init.h>
  84#include <linux/workqueue.h>
  85#include <linux/skbuff.h>
  86#include <linux/inetdevice.h>
  87#include <linux/igmp.h>
  88#include <linux/pkt_sched.h>
  89#include <linux/mroute.h>
  90#include <linux/netfilter_ipv4.h>
  91#include <linux/random.h>
  92#include <linux/jhash.h>
  93#include <linux/rcupdate.h>
  94#include <linux/times.h>
  95#include <net/dst.h>
  96#include <net/net_namespace.h>
  97#include <net/protocol.h>
  98#include <net/ip.h>
  99#include <net/route.h>
 100#include <net/inetpeer.h>
 101#include <net/sock.h>
 102#include <net/ip_fib.h>
 103#include <net/arp.h>
 104#include <net/tcp.h>
 105#include <net/icmp.h>
 106#include <net/xfrm.h>
 107#include <net/netevent.h>
 108#include <net/rtnetlink.h>
 109#ifdef CONFIG_SYSCTL
 110#include <linux/sysctl.h>
 111#endif
 112
 /*
  * RT_FL_TOS - extract the routing-relevant TOS bits of a flow key.
  * @oldflp: pointer expression to a flow structure exposing fl4_tos
  *
  * Masks fl4_tos with (IPTOS_RT_MASK | RTO_ONLINK) and yields the result
  * as a u32.  The argument is parenthesized so that any pointer-valued
  * expression (for example "&flow") expands correctly.
  */
#define RT_FL_TOS(oldflp) \
        ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 115
 /* MTU constant defined by this file (0xFFF0 == 65520); its users are outside this chunk. */
 116#define IP_MAX_MTU      0xFFF0
 117
/* Initial value of ip_rt_gc_timeout and of the "expire" strength in rt_garbage_collect(). */
 118#define RT_GC_TIMEOUT (300*HZ)
 119
/*
 * Route cache tunables.  Values written in terms of HZ are in jiffies.
 * Variables whose users are not visible in this part of the file are
 * deliberately left without a description.
 */
 120static int ip_rt_max_size;      /* rt_garbage_collect() stops rate-limiting itself once the entry count reaches this */
 121static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;        /* age limit handed to rt_may_expire() by rt_check_expire() */
 122static int ip_rt_gc_interval __read_mostly      = 60 * HZ;      /* delay with which rt_worker_func() re-queues itself */
 123static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;       /* minimum spacing between effective rt_garbage_collect() runs */
 124static int ip_rt_redirect_number __read_mostly  = 9;
 125static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 126static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 127static int ip_rt_error_cost __read_mostly       = HZ;
 128static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 129static int ip_rt_gc_elasticity __read_mostly    = 8;    /* shifted left by rt_hash_log to form the GC entry-count threshold */
 130static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 131static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 132static int ip_rt_min_advmss __read_mostly       = 256;
 133static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ; /* period with which rt_secret_rebuild() re-arms rt_secret_timer */
 134
/* Periodic expiry: expires_work runs rt_worker_func(), which re-queues itself. */
 135static void rt_worker_func(struct work_struct *work);
 136static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
/* Timer re-armed by rt_secret_rebuild() each time it fires. */
 137static struct timer_list rt_secret_timer;
 138
 139/*
 140 *      Interface to generic destination cache.
 141 */
 142
 /*
  * Forward declarations of the callbacks installed in ipv4_dst_ops below.
  * Apart from rt_garbage_collect(), their definitions are outside this
  * part of the file.
  */
 143static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 144static void              ipv4_dst_destroy(struct dst_entry *dst);
 145static void              ipv4_dst_ifdown(struct dst_entry *dst,
 146                                         struct net_device *dev, int how);
 147static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 148static void              ipv4_link_failure(struct sk_buff *skb);
 149static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 150static int rt_garbage_collect(struct dst_ops *ops);
 151
 152
/*
 * dst_ops instance for IPv4 routes.  Each entry has the size of a
 * struct rtable; the "entries" counter starts at zero and is the value
 * read by rt_garbage_collect() and reported by rt_cpu_seq_show().
 */
 153static struct dst_ops ipv4_dst_ops = {
 154        .family =               AF_INET,
 155        .protocol =             __constant_htons(ETH_P_IP),
 156        .gc =                   rt_garbage_collect,
 157        .check =                ipv4_dst_check,
 158        .destroy =              ipv4_dst_destroy,
 159        .ifdown =               ipv4_dst_ifdown,
 160        .negative_advice =      ipv4_negative_advice,
 161        .link_failure =         ipv4_link_failure,
 162        .update_pmtu =          ip_rt_update_pmtu,
 163        .local_out =            __ip_local_out,
 164        .entry_size =           sizeof(struct rtable),
 165        .entries =              ATOMIC_INIT(0),
 166};
 167
 /* ECN_OR_COST(class) simply expands to TC_PRIO_<class>. */
 168#define ECN_OR_COST(class)      TC_PRIO_##class
 169
/*
 * 16-entry table of TC_PRIO_* values.  Every odd slot is written through
 * ECN_OR_COST().
 * NOTE(review): presumably indexed by the 4-bit TOS field of the IP
 * header; the lookup code is not in this file chunk - confirm.
 */
 170const __u8 ip_tos2prio[16] = {
 171        TC_PRIO_BESTEFFORT,
 172        ECN_OR_COST(FILLER),
 173        TC_PRIO_BESTEFFORT,
 174        ECN_OR_COST(BESTEFFORT),
 175        TC_PRIO_BULK,
 176        ECN_OR_COST(BULK),
 177        TC_PRIO_BULK,
 178        ECN_OR_COST(BULK),
 179        TC_PRIO_INTERACTIVE,
 180        ECN_OR_COST(INTERACTIVE),
 181        TC_PRIO_INTERACTIVE,
 182        ECN_OR_COST(INTERACTIVE),
 183        TC_PRIO_INTERACTIVE_BULK,
 184        ECN_OR_COST(INTERACTIVE_BULK),
 185        TC_PRIO_INTERACTIVE_BULK,
 186        ECN_OR_COST(INTERACTIVE_BULK)
 187};
 188
 189
 190/*
 191 * Route cache.
 192 */
 193
 194/* The locking scheme is rather straightforward:
 195 *
 196 * 1) Read-Copy Update protects the buckets of the central route hash.
 197 * 2) Only writers remove entries, and they hold the lock
 198 *    as they look at rtable reference counts.
 199 * 3) Only readers acquire references to rtable entries,
 200 *    they do so with atomic increments and with the
 201 *    lock held.
 202 */
 203
 /*
  * One slot of the route cache hash table: the head of a singly linked
  * chain of rtable entries, linked through u.dst.rt_next.
  */
 204struct rt_hash_bucket {
 205        struct rtable   *chain;
 206};
 /*
  * Bucket locking for the route cache hash table.
  *
  * rt_hash_lock_addr(slot) yields the spinlock guarding bucket "slot"
  * (or NULL when no lock table is compiled in), and rt_hash_lock_init()
  * allocates and initializes the lock table at boot.
  */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;
/*
 * Several buckets share one lock: the slot number is folded into the
 * lock table with a power-of-two mask.  The whole expansion is
 * parenthesized so the macro behaves like a single primary expression.
 */
# define rt_hash_lock_addr(slot) \
        (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/* Allocate the lock table and initialize every spinlock; panics on OOM. */
static __init void rt_hash_lock_init(void)
{
        int i;

        rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
                        GFP_KERNEL);
        if (!rt_hash_locks)
                panic("IP: failed to allocate rt_hash_locks\n");

        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
                spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: callers are handed NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
 252
 /* Route cache hash table state. */
 253static struct rt_hash_bucket    *rt_hash_table __read_mostly;  /* bucket array, indexed 0..rt_hash_mask */
 254static unsigned                 rt_hash_mask __read_mostly;    /* ANDed with the hash value to select a bucket */
 255static unsigned int             rt_hash_log  __read_mostly;    /* shift count used to scale values by the table size */
 256static atomic_t                 rt_genid __read_mostly;        /* generation id: mixed into rt_hash(), advanced by rt_cache_invalidate() */
 257
/* Per-CPU route cache statistics, reported by rt_cpu_seq_show(). */
 258static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment one field of the current CPU's statistics block. */
 259#define RT_CACHE_STAT_INC(field) \
 260        (__raw_get_cpu_var(rt_cache_stat).field++)
 261
 262static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
 263{
 264        return jhash_3words((__force u32)(__be32)(daddr),
 265                            (__force u32)(__be32)(saddr),
 266                            idx, atomic_read(&rt_genid))
 267                & rt_hash_mask;
 268}
 269
 270#ifdef CONFIG_PROC_FS
 /* Private seq_file cursor used while dumping the route cache. */
 271struct rt_cache_iter_state {
 272        struct seq_net_private p;       /* NOTE(review): presumably what seq_open_net()/seq_file_net() rely on - confirm */
 273        int bucket;                     /* bucket being walked; counts down from rt_hash_mask to 0 */
 274        int genid;                      /* rt_genid sampled in rt_cache_seq_start(); other generations are skipped */
 275};
 276
 277static struct rtable *rt_cache_get_first(struct seq_file *seq)
 278{
 279        struct rt_cache_iter_state *st = seq->private;
 280        struct rtable *r = NULL;
 281
 282        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 283                rcu_read_lock_bh();
 284                r = rcu_dereference(rt_hash_table[st->bucket].chain);
 285                while (r) {
 286                        if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 287                            r->rt_genid == st->genid)
 288                                return r;
 289                        r = rcu_dereference(r->u.dst.rt_next);
 290                }
 291                rcu_read_unlock_bh();
 292        }
 293        return r;
 294}
 295
 296static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 297                                          struct rtable *r)
 298{
 299        struct rt_cache_iter_state *st = seq->private;
 300        r = r->u.dst.rt_next;
 301        while (!r) {
 302                rcu_read_unlock_bh();
 303                if (--st->bucket < 0)
 304                        break;
 305                rcu_read_lock_bh();
 306                r = rt_hash_table[st->bucket].chain;
 307        }
 308        return rcu_dereference(r);
 309}
 310
 311static struct rtable *rt_cache_get_next(struct seq_file *seq,
 312                                        struct rtable *r)
 313{
 314        struct rt_cache_iter_state *st = seq->private;
 315        while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 316                if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 317                        continue;
 318                if (r->rt_genid == st->genid)
 319                        break;
 320        }
 321        return r;
 322}
 323
 324static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 325{
 326        struct rtable *r = rt_cache_get_first(seq);
 327
 328        if (r)
 329                while (pos && (r = rt_cache_get_next(seq, r)))
 330                        --pos;
 331        return pos ? NULL : r;
 332}
 333
 334static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 335{
 336        struct rt_cache_iter_state *st = seq->private;
 337        if (*pos)
 338                return rt_cache_get_idx(seq, *pos - 1);
 339        st->genid = atomic_read(&rt_genid);
 340        return SEQ_START_TOKEN;
 341}
 342
 343static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 344{
 345        struct rtable *r;
 346
 347        if (v == SEQ_START_TOKEN)
 348                r = rt_cache_get_first(seq);
 349        else
 350                r = rt_cache_get_next(seq, v);
 351        ++*pos;
 352        return r;
 353}
 354
 355static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 356{
 357        if (v && v != SEQ_START_TOKEN)
 358                rcu_read_unlock_bh();
 359}
 360
 361static int rt_cache_seq_show(struct seq_file *seq, void *v)
 362{
 363        if (v == SEQ_START_TOKEN)
 364                seq_printf(seq, "%-127s\n",
 365                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 366                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 367                           "HHUptod\tSpecDst");
 368        else {
 369                struct rtable *r = v;
 370                int len;
 371
 372                seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 373                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 374                        r->u.dst.dev ? r->u.dst.dev->name : "*",
 375                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 376                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 377                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
 378                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 379                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 380                        dst_metric(&r->u.dst, RTAX_WINDOW),
 381                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 382                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
 383                        r->fl.fl4_tos,
 384                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 385                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 386                                       dev_queue_xmit) : 0,
 387                        r->rt_spec_dst, &len);
 388
 389                seq_printf(seq, "%*s\n", 127 - len, "");
 390        }
 391        return 0;
 392}
 393
 /* seq_file iterator callbacks behind the route cache listing (see rt_cache_seq_open()). */
 394static const struct seq_operations rt_cache_seq_ops = {
 395        .start  = rt_cache_seq_start,
 396        .next   = rt_cache_seq_next,
 397        .stop   = rt_cache_seq_stop,
 398        .show   = rt_cache_seq_show,
 399};
 400
 401static int rt_cache_seq_open(struct inode *inode, struct file *file)
 402{
 403        return seq_open_net(inode, file, &rt_cache_seq_ops,
 404                        sizeof(struct rt_cache_iter_state));
 405}
 406
 /* File operations for the "rt_cache" entry that ip_rt_do_proc_init() creates with proc_net_fops_create(). */
 407static const struct file_operations rt_cache_seq_fops = {
 408        .owner   = THIS_MODULE,
 409        .open    = rt_cache_seq_open,
 410        .read    = seq_read,
 411        .llseek  = seq_lseek,
 412        .release = seq_release_net,
 413};
 414
 415
 416static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 417{
 418        int cpu;
 419
 420        if (*pos == 0)
 421                return SEQ_START_TOKEN;
 422
 423        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 424                if (!cpu_possible(cpu))
 425                        continue;
 426                *pos = cpu+1;
 427                return &per_cpu(rt_cache_stat, cpu);
 428        }
 429        return NULL;
 430}
 431
 432static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 433{
 434        int cpu;
 435
 436        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 437                if (!cpu_possible(cpu))
 438                        continue;
 439                *pos = cpu+1;
 440                return &per_cpu(rt_cache_stat, cpu);
 441        }
 442        return NULL;
 443
 444}
 445
 /* seq_file ->stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
 450
 451static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 452{
 453        struct rt_cache_stat *st = v;
 454
 455        if (v == SEQ_START_TOKEN) {
 456                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 457                return 0;
 458        }
 459
 460        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 461                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 462                   atomic_read(&ipv4_dst_ops.entries),
 463                   st->in_hit,
 464                   st->in_slow_tot,
 465                   st->in_slow_mc,
 466                   st->in_no_route,
 467                   st->in_brd,
 468                   st->in_martian_dst,
 469                   st->in_martian_src,
 470
 471                   st->out_hit,
 472                   st->out_slow_tot,
 473                   st->out_slow_mc,
 474
 475                   st->gc_total,
 476                   st->gc_ignored,
 477                   st->gc_goal_miss,
 478                   st->gc_dst_overflow,
 479                   st->in_hlist_search,
 480                   st->out_hlist_search
 481                );
 482        return 0;
 483}
 484
 /* seq_file iterator callbacks behind the per-CPU route cache statistics file. */
 485static const struct seq_operations rt_cpu_seq_ops = {
 486        .start  = rt_cpu_seq_start,
 487        .next   = rt_cpu_seq_next,
 488        .stop   = rt_cpu_seq_stop,
 489        .show   = rt_cpu_seq_show,
 490};
 491
 492
 493static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 494{
 495        return seq_open(file, &rt_cpu_seq_ops);
 496}
 497
 /* File operations for the "rt_cache" entry that ip_rt_do_proc_init() creates under net->proc_net_stat. */
 498static const struct file_operations rt_cpu_seq_fops = {
 499        .owner   = THIS_MODULE,
 500        .open    = rt_cpu_seq_open,
 501        .read    = seq_read,
 502        .llseek  = seq_lseek,
 503        .release = seq_release,
 504};
 505
 506#ifdef CONFIG_NET_CLS_ROUTE
 507static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 508                           int length, int *eof, void *data)
 509{
 510        unsigned int i;
 511
 512        if ((offset & 3) || (length & 3))
 513                return -EIO;
 514
 515        if (offset >= sizeof(struct ip_rt_acct) * 256) {
 516                *eof = 1;
 517                return 0;
 518        }
 519
 520        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 521                length = sizeof(struct ip_rt_acct) * 256 - offset;
 522                *eof = 1;
 523        }
 524
 525        offset /= sizeof(u32);
 526
 527        if (length > 0) {
 528                u32 *dst = (u32 *) buffer;
 529
 530                *start = buffer;
 531                memset(dst, 0, length);
 532
 533                for_each_possible_cpu(i) {
 534                        unsigned int j;
 535                        u32 *src;
 536
 537                        src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 538                        for (j = 0; j < length/4; j++)
 539                                dst[j] += src[j];
 540                }
 541        }
 542        return length;
 543}
 544#endif
 545
 546static int __net_init ip_rt_do_proc_init(struct net *net)
 547{
 548        struct proc_dir_entry *pde;
 549
 550        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 551                        &rt_cache_seq_fops);
 552        if (!pde)
 553                goto err1;
 554
 555        pde = proc_create("rt_cache", S_IRUGO,
 556                          net->proc_net_stat, &rt_cpu_seq_fops);
 557        if (!pde)
 558                goto err2;
 559
 560#ifdef CONFIG_NET_CLS_ROUTE
 561        pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 562                        ip_rt_acct_read, NULL);
 563        if (!pde)
 564                goto err3;
 565#endif
 566        return 0;
 567
 568#ifdef CONFIG_NET_CLS_ROUTE
 569err3:
 570        remove_proc_entry("rt_cache", net->proc_net_stat);
 571#endif
 572err2:
 573        remove_proc_entry("rt_cache", net->proc_net);
 574err1:
 575        return -ENOMEM;
 576}
 577
 578static void __net_exit ip_rt_do_proc_exit(struct net *net)
 579{
 580        remove_proc_entry("rt_cache", net->proc_net_stat);
 581        remove_proc_entry("rt_cache", net->proc_net);
 582        remove_proc_entry("rt_acct", net->proc_net);
 583}
 584
 /* Per-network-namespace hooks that create and remove the route cache proc entries. */
 585static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 586        .init = ip_rt_do_proc_init,
 587        .exit = ip_rt_do_proc_exit,
 588};
 589
 590static int __init ip_rt_proc_init(void)
 591{
 592        return register_pernet_subsys(&ip_rt_proc_ops);
 593}
 594
 595#else
 596static inline int ip_rt_proc_init(void)
 597{
 598        return 0;
 599}
 600#endif /* CONFIG_PROC_FS */
 601
 602static inline void rt_free(struct rtable *rt)
 603{
 604        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 605}
 606
 /*
  * Release the caller's reference on @rt with ip_rt_put(), then queue
  * the entry for RCU-deferred freeing.
  */
static inline void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        rt_free(rt);
}
 612
 613static inline int rt_fast_clean(struct rtable *rth)
 614{
 615        /* Kill broadcast/multicast entries very aggresively, if they
 616           collide in hash table with more useful entries */
 617        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 618                rth->fl.iif && rth->u.dst.rt_next;
 619}
 620
 621static inline int rt_valuable(struct rtable *rth)
 622{
 623        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 624                rth->u.dst.expires;
 625}
 626
 627static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 628{
 629        unsigned long age;
 630        int ret = 0;
 631
 632        if (atomic_read(&rth->u.dst.__refcnt))
 633                goto out;
 634
 635        ret = 1;
 636        if (rth->u.dst.expires &&
 637            time_after_eq(jiffies, rth->u.dst.expires))
 638                goto out;
 639
 640        age = jiffies - rth->u.dst.lastuse;
 641        ret = 0;
 642        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 643            (age <= tmo2 && rt_valuable(rth)))
 644                goto out;
 645        ret = 1;
 646out:    return ret;
 647}
 648
 649/* Bits of score are:
 650 * 31: very valuable
 651 * 30: not quite useless
 652 * 29..0: usage counter
 653 */
 654static inline u32 rt_score(struct rtable *rt)
 655{
 656        u32 score = jiffies - rt->u.dst.lastuse;
 657
 658        score = ~score & ~(3<<30);
 659
 660        if (rt_valuable(rt))
 661                score |= (1<<31);
 662
 663        if (!rt->fl.iif ||
 664            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 665                score |= (1<<30);
 666
 667        return score;
 668}
 669
 670static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 671{
 672        return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 673                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 674                (fl1->mark ^ fl2->mark) |
 675                (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 676                 *(u16 *)&fl2->nl_u.ip4_u.tos) |
 677                (fl1->oif ^ fl2->oif) |
 678                (fl1->iif ^ fl2->iif)) == 0;
 679}
 680
 681static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 682{
 683        return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 684}
 685
 686/*
 687 * Perform a full scan of hash table and free all entries.
 688 * Can be called by a softirq or a process.
 689 * In the later case, we want to be reschedule if necessary
 690 */
 691static void rt_do_flush(int process_context)
 692{
 693        unsigned int i;
 694        struct rtable *rth, *next;
 695
 696        for (i = 0; i <= rt_hash_mask; i++) {
 697                if (process_context && need_resched())
 698                        cond_resched();
 699                rth = rt_hash_table[i].chain;
 700                if (!rth)
 701                        continue;
 702
 703                spin_lock_bh(rt_hash_lock_addr(i));
 704                rth = rt_hash_table[i].chain;
 705                rt_hash_table[i].chain = NULL;
 706                spin_unlock_bh(rt_hash_lock_addr(i));
 707
 708                for (; rth; rth = next) {
 709                        next = rth->u.dst.rt_next;
 710                        rt_free(rth);
 711                }
 712        }
 713}
 714
 715static void rt_check_expire(void)
 716{
 717        static unsigned int rover;
 718        unsigned int i = rover, goal;
 719        struct rtable *rth, **rthp;
 720        u64 mult;
 721
 722        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 723        if (ip_rt_gc_timeout > 1)
 724                do_div(mult, ip_rt_gc_timeout);
 725        goal = (unsigned int)mult;
 726        if (goal > rt_hash_mask)
 727                goal = rt_hash_mask + 1;
 728        for (; goal > 0; goal--) {
 729                unsigned long tmo = ip_rt_gc_timeout;
 730
 731                i = (i + 1) & rt_hash_mask;
 732                rthp = &rt_hash_table[i].chain;
 733
 734                if (need_resched())
 735                        cond_resched();
 736
 737                if (*rthp == NULL)
 738                        continue;
 739                spin_lock_bh(rt_hash_lock_addr(i));
 740                while ((rth = *rthp) != NULL) {
 741                        if (rth->rt_genid != atomic_read(&rt_genid)) {
 742                                *rthp = rth->u.dst.rt_next;
 743                                rt_free(rth);
 744                                continue;
 745                        }
 746                        if (rth->u.dst.expires) {
 747                                /* Entry is expired even if it is in use */
 748                                if (time_before_eq(jiffies, rth->u.dst.expires)) {
 749                                        tmo >>= 1;
 750                                        rthp = &rth->u.dst.rt_next;
 751                                        continue;
 752                                }
 753                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 754                                tmo >>= 1;
 755                                rthp = &rth->u.dst.rt_next;
 756                                continue;
 757                        }
 758
 759                        /* Cleanup aged off entries. */
 760                        *rthp = rth->u.dst.rt_next;
 761                        rt_free(rth);
 762                }
 763                spin_unlock_bh(rt_hash_lock_addr(i));
 764        }
 765        rover = i;
 766}
 767
 768/*
 769 * rt_worker_func() is run in process context.
 770 * we call rt_check_expire() to scan part of the hash table
 771 */
 772static void rt_worker_func(struct work_struct *work)
 773{
 774        rt_check_expire();
 775        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 776}
 777
 778/*
 779 * Pertubation of rt_genid by a small quantity [1..256]
 780 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 781 * many times (2^24) without giving recent rt_genid.
 782 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 783 */
 784static void rt_cache_invalidate(void)
 785{
 786        unsigned char shuffle;
 787
 788        get_random_bytes(&shuffle, sizeof(shuffle));
 789        atomic_add(shuffle + 1U, &rt_genid);
 790}
 791
 /*
  * delay < 0  : invalidate cache (fast : entries will be deleted later)
  * delay >= 0 : invalidate & flush cache (can be long)
  *
  * The flush is told whether it runs in process context, i.e. whether
  * we are outside of softirq.
  */
void rt_cache_flush(int delay)
{
        rt_cache_invalidate();

        if (delay < 0)
                return;

        rt_do_flush(!in_softirq());
}
 802
 803/*
 804 * We change rt_genid and let gc do the cleanup
 805 */
 806static void rt_secret_rebuild(unsigned long dummy)
 807{
 808        rt_cache_invalidate();
 809        mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
 810}
 811
 812/*
 813   Short description of GC goals.
 814
 815   We want an algorithm that keeps the routing cache
 816   at an equilibrium point, where the number of aged-off entries
 817   is approximately equal to the number of newly generated ones.
 818
 819   The current expiration strength is the variable "expire".
 820   We try to adjust it dynamically, so that when networking
 821   is idle "expire" is large enough to keep enough warm entries,
 822   and when load increases it is reduced to limit the cache size.
 823 */
 824
 825static int rt_garbage_collect(struct dst_ops *ops)
 826{
 827        static unsigned long expire = RT_GC_TIMEOUT;
 828        static unsigned long last_gc;
 829        static int rover;
 830        static int equilibrium;
 831        struct rtable *rth, **rthp;
 832        unsigned long now = jiffies;
 833        int goal;
 834
 835        /*
 836         * Garbage collection is pretty expensive,
 837         * do not make it too frequently.
 838         */
 839
 840        RT_CACHE_STAT_INC(gc_total);
 841
 842        if (now - last_gc < ip_rt_gc_min_interval &&
 843            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 844                RT_CACHE_STAT_INC(gc_ignored);
 845                goto out;
 846        }
 847
 848        /* Calculate number of entries, which we want to expire now. */
 849        goal = atomic_read(&ipv4_dst_ops.entries) -
 850                (ip_rt_gc_elasticity << rt_hash_log);
 851        if (goal <= 0) {
 852                if (equilibrium < ipv4_dst_ops.gc_thresh)
 853                        equilibrium = ipv4_dst_ops.gc_thresh;
 854                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 855                if (goal > 0) {
 856                        equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 857                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 858                }
 859        } else {
 860                /* We are in dangerous area. Try to reduce cache really
 861                 * aggressively.
 862                 */
 863                goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 864                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 865        }
 866
 867        if (now - last_gc >= ip_rt_gc_min_interval)
 868                last_gc = now;
 869
 870        if (goal <= 0) {
 871                equilibrium += goal;
 872                goto work_done;
 873        }
 874
 875        do {
 876                int i, k;
 877
 878                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 879                        unsigned long tmo = expire;
 880
 881                        k = (k + 1) & rt_hash_mask;
 882                        rthp = &rt_hash_table[k].chain;
 883                        spin_lock_bh(rt_hash_lock_addr(k));
 884                        while ((rth = *rthp) != NULL) {
 885                                if (rth->rt_genid == atomic_read(&rt_genid) &&
 886                                        !rt_may_expire(rth, tmo, expire)) {
 887                                        tmo >>= 1;
 888                                        rthp = &rth->u.dst.rt_next;
 889                                        continue;
 890                                }
 891                                *rthp = rth->u.dst.rt_next;
 892                                rt_free(rth);
 893                                goal--;
 894                        }
 895                        spin_unlock_bh(rt_hash_lock_addr(k));
 896                        if (goal <= 0)
 897                                break;
 898                }
 899                rover = k;
 900
 901                if (goal <= 0)
 902                        goto work_done;
 903
 904                /* Goal is not achieved. We stop process if:
 905
 906                   - if expire reduced to zero. Otherwise, expire is halfed.
 907                   - if table is not full.
 908                   - if we are called from interrupt.
 909                   - jiffies check is just fallback/debug loop breaker.
 910                     We will not spin here for long time in any case.
 911                 */
 912
 913                RT_CACHE_STAT_INC(gc_goal_miss);
 914
 915                if (expire == 0)
 916                        break;
 917
 918                expire >>= 1;
 919#if RT_CACHE_DEBUG >= 2
 920                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 921                                atomic_read(&ipv4_dst_ops.entries), goal, i);
 922#endif
 923
 924                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 925                        goto out;
 926        } while (!in_softirq() && time_before_eq(jiffies, now));
 927
 928        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 929                goto out;
 930        if (net_ratelimit())
 931                printk(KERN_WARNING "dst cache overflow\n");
 932        RT_CACHE_STAT_INC(gc_dst_overflow);
 933        return 1;
 934
 935work_done:
 936        expire += ip_rt_gc_min_interval;
 937        if (expire > ip_rt_gc_timeout ||
 938            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 939                expire = ip_rt_gc_timeout;
 940#if RT_CACHE_DEBUG >= 2
 941        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 942                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
 943#endif
 944out:    return 0;
 945}
 946
 947static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 948{
 949        struct rtable   *rth, **rthp;
 950        unsigned long   now;
 951        struct rtable *cand, **candp;
 952        u32             min_score;
 953        int             chain_length;
 954        int attempts = !in_softirq();
 955
 956restart:
 957        chain_length = 0;
 958        min_score = ~(u32)0;
 959        cand = NULL;
 960        candp = NULL;
 961        now = jiffies;
 962
 963        rthp = &rt_hash_table[hash].chain;
 964
 965        spin_lock_bh(rt_hash_lock_addr(hash));
 966        while ((rth = *rthp) != NULL) {
 967                if (rth->rt_genid != atomic_read(&rt_genid)) {
 968                        *rthp = rth->u.dst.rt_next;
 969                        rt_free(rth);
 970                        continue;
 971                }
 972                if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
 973                        /* Put it first */
 974                        *rthp = rth->u.dst.rt_next;
 975                        /*
 976                         * Since lookup is lockfree, the deletion
 977                         * must be visible to another weakly ordered CPU before
 978                         * the insertion at the start of the hash chain.
 979                         */
 980                        rcu_assign_pointer(rth->u.dst.rt_next,
 981                                           rt_hash_table[hash].chain);
 982                        /*
 983                         * Since lookup is lockfree, the update writes
 984                         * must be ordered for consistency on SMP.
 985                         */
 986                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 987
 988                        dst_use(&rth->u.dst, now);
 989                        spin_unlock_bh(rt_hash_lock_addr(hash));
 990
 991                        rt_drop(rt);
 992                        *rp = rth;
 993                        return 0;
 994                }
 995
 996                if (!atomic_read(&rth->u.dst.__refcnt)) {
 997                        u32 score = rt_score(rth);
 998
 999                        if (score <= min_score) {
1000                                cand = rth;
1001                                candp = rthp;
1002                                min_score = score;
1003                        }
1004                }
1005
1006                chain_length++;
1007
1008                rthp = &rth->u.dst.rt_next;
1009        }
1010
1011        if (cand) {
1012                /* ip_rt_gc_elasticity used to be average length of chain
1013                 * length, when exceeded gc becomes really aggressive.
1014                 *
1015                 * The second limit is less certain. At the moment it allows
1016                 * only 2 entries per bucket. We will see.
1017                 */
1018                if (chain_length > ip_rt_gc_elasticity) {
1019                        *candp = cand->u.dst.rt_next;
1020                        rt_free(cand);
1021                }
1022        }
1023
1024        /* Try to bind route to arp only if it is output
1025           route or unicast forwarding path.
1026         */
1027        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1028                int err = arp_bind_neighbour(&rt->u.dst);
1029                if (err) {
1030                        spin_unlock_bh(rt_hash_lock_addr(hash));
1031
1032                        if (err != -ENOBUFS) {
1033                                rt_drop(rt);
1034                                return err;
1035                        }
1036
1037                        /* Neighbour tables are full and nothing
1038                           can be released. Try to shrink route cache,
1039                           it is most likely it holds some neighbour records.
1040                         */
1041                        if (attempts-- > 0) {
1042                                int saved_elasticity = ip_rt_gc_elasticity;
1043                                int saved_int = ip_rt_gc_min_interval;
1044                                ip_rt_gc_elasticity     = 1;
1045                                ip_rt_gc_min_interval   = 0;
1046                                rt_garbage_collect(&ipv4_dst_ops);
1047                                ip_rt_gc_min_interval   = saved_int;
1048                                ip_rt_gc_elasticity     = saved_elasticity;
1049                                goto restart;
1050                        }
1051
1052                        if (net_ratelimit())
1053                                printk(KERN_WARNING "Neighbour table overflow.\n");
1054                        rt_drop(rt);
1055                        return -ENOBUFS;
1056                }
1057        }
1058
1059        rt->u.dst.rt_next = rt_hash_table[hash].chain;
1060#if RT_CACHE_DEBUG >= 2
1061        if (rt->u.dst.rt_next) {
1062                struct rtable *trt;
1063                printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1064                       NIPQUAD(rt->rt_dst));
1065                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1066                        printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1067                printk("\n");
1068        }
1069#endif
1070        rt_hash_table[hash].chain = rt;
1071        spin_unlock_bh(rt_hash_lock_addr(hash));
1072        *rp = rt;
1073        return 0;
1074}
1075
1076void rt_bind_peer(struct rtable *rt, int create)
1077{
1078        static DEFINE_SPINLOCK(rt_peer_lock);
1079        struct inet_peer *peer;
1080
1081        peer = inet_getpeer(rt->rt_dst, create);
1082
1083        spin_lock_bh(&rt_peer_lock);
1084        if (rt->peer == NULL) {
1085                rt->peer = peer;
1086                peer = NULL;
1087        }
1088        spin_unlock_bh(&rt_peer_lock);
1089        if (peer)
1090                inet_putpeer(peer);
1091}
1092
1093/*
1094 * Peer allocation may fail only in serious out-of-memory conditions.  However
1095 * we still can generate some output.
1096 * Random ID selection looks a bit dangerous because we have no chances to
1097 * select ID being unique in a reasonable period of time.
1098 * But broken packet identifier may be better than no packet at all.
1099 */
1100static void ip_select_fb_ident(struct iphdr *iph)
1101{
1102        static DEFINE_SPINLOCK(ip_fb_id_lock);
1103        static u32 ip_fallback_id;
1104        u32 salt;
1105
1106        spin_lock_bh(&ip_fb_id_lock);
1107        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1108        iph->id = htons(salt & 0xFFFF);
1109        ip_fallback_id = salt;
1110        spin_unlock_bh(&ip_fb_id_lock);
1111}
1112
1113void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1114{
1115        struct rtable *rt = (struct rtable *) dst;
1116
1117        if (rt) {
1118                if (rt->peer == NULL)
1119                        rt_bind_peer(rt, 1);
1120
1121                /* If peer is attached to destination, it is never detached,
1122                   so that we need not to grab a lock to dereference it.
1123                 */
1124                if (rt->peer) {
1125                        iph->id = htons(inet_getid(rt->peer, more));
1126                        return;
1127                }
1128        } else
1129                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1130                       __builtin_return_address(0));
1131
1132        ip_select_fb_ident(iph);
1133}
1134
1135static void rt_del(unsigned hash, struct rtable *rt)
1136{
1137        struct rtable **rthp, *aux;
1138
1139        rthp = &rt_hash_table[hash].chain;
1140        spin_lock_bh(rt_hash_lock_addr(hash));
1141        ip_rt_put(rt);
1142        while ((aux = *rthp) != NULL) {
1143                if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1144                        *rthp = aux->u.dst.rt_next;
1145                        rt_free(aux);
1146                        continue;
1147                }
1148                rthp = &aux->u.dst.rt_next;
1149        }
1150        spin_unlock_bh(rt_hash_lock_addr(hash));
1151}
1152
/*
 * ip_rt_redirect - replace cached routes via @old_gw with routes via @new_gw
 * @old_gw: gateway currently recorded in the cached route
 * @daddr:  destination the redirect is about
 * @new_gw: gateway advised instead
 * @saddr:  source address used as one of the lookup keys
 * @dev:    device the redirect applies to
 *
 * After validating @new_gw, every combination of {@saddr, 0} and
 * {@dev->ifindex, 0} is hashed and the matching chain is searched.  A
 * matching output route is cloned, the clone gets @new_gw and
 * RTCF_REDIRECTED, and -- if its neighbour is in a NUD_VALID state -- the
 * old entry is deleted and the clone is interned in its place.
 *
 * The in_device reference taken at entry is dropped on every exit path.
 */
1153void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1154                    __be32 saddr, struct net_device *dev)
1155{
1156        int i, k;
1157        struct in_device *in_dev = in_dev_get(dev);
1158        struct rtable *rth, **rthp;
1159        __be32  skeys[2] = { saddr, 0 };
1160        int  ikeys[2] = { dev->ifindex, 0 };
1161        struct netevent_redirect netevent;
1162        struct net *net;
1163
1164        if (!in_dev)
1165                return;
1166
1167        net = dev_net(dev);
        /* Reject no-op redirects, devices that do not accept redirects,
         * and gateways that are multicast, limited broadcast or zeronet.
         */
1168        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1169            || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1170            || ipv4_is_zeronet(new_gw))
1171                goto reject_redirect;
1172
1173        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1174                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1175                        goto reject_redirect;
1176                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1177                        goto reject_redirect;
1178        } else {
1179                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1180                        goto reject_redirect;
1181        }
1182
        /* Try each source key (saddr, 0) with each oif key (ifindex, 0). */
1183        for (i = 0; i < 2; i++) {
1184                for (k = 0; k < 2; k++) {
1185                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1186
1187                        rthp=&rt_hash_table[hash].chain;
1188
1189                        rcu_read_lock();
1190                        while ((rth = rcu_dereference(*rthp)) != NULL) {
1191                                struct rtable *rt;
1192
                                /* Skip entries whose keys, generation or
                                 * netns do not match.
                                 */
1193                                if (rth->fl.fl4_dst != daddr ||
1194                                    rth->fl.fl4_src != skeys[i] ||
1195                                    rth->fl.oif != ikeys[k] ||
1196                                    rth->fl.iif != 0 ||
1197                                    rth->rt_genid != atomic_read(&rt_genid) ||
1198                                    !net_eq(dev_net(rth->u.dst.dev), net)) {
1199                                        rthp = &rth->u.dst.rt_next;
1200                                        continue;
1201                                }
1202
                                /* Keys match but the route is not the one the
                                 * redirect describes: give up on this chain.
                                 */
1203                                if (rth->rt_dst != daddr ||
1204                                    rth->rt_src != saddr ||
1205                                    rth->u.dst.error ||
1206                                    rth->rt_gateway != old_gw ||
1207                                    rth->u.dst.dev != dev)
1208                                        break;
1209
                                /* Take a reference on rth before leaving the
                                 * RCU read-side section.
                                 */
1210                                dst_hold(&rth->u.dst);
1211                                rcu_read_unlock();
1212
1213                                rt = dst_alloc(&ipv4_dst_ops);
1214                                if (rt == NULL) {
1215                                        ip_rt_put(rth);
1216                                        in_dev_put(in_dev);
1217                                        return;
1218                                }
1219
1220                                /* Copy all the information. */
1221                                *rt = *rth;
                                /* The struct copy duplicated pointers without
                                 * their references; reset per-entry state and
                                 * take references for the clone.
                                 */
1222                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1223                                rt->u.dst.__use         = 1;
1224                                atomic_set(&rt->u.dst.__refcnt, 1);
1225                                rt->u.dst.child         = NULL;
1226                                if (rt->u.dst.dev)
1227                                        dev_hold(rt->u.dst.dev);
1228                                if (rt->idev)
1229                                        in_dev_hold(rt->idev);
1230                                rt->u.dst.obsolete      = 0;
1231                                rt->u.dst.lastuse       = jiffies;
1232                                rt->u.dst.path          = &rt->u.dst;
1233                                rt->u.dst.neighbour     = NULL;
1234                                rt->u.dst.hh            = NULL;
1235                                rt->u.dst.xfrm          = NULL;
1236                                rt->rt_genid            = atomic_read(&rt_genid);
1237                                rt->rt_flags            |= RTCF_REDIRECTED;
1238
1239                                /* Gateway is different ... */
1240                                rt->rt_gateway          = new_gw;
1241
1242                                /* Redirect received -> path was valid */
1243                                dst_confirm(&rth->u.dst);
1244
1245                                if (rt->peer)
1246                                        atomic_inc(&rt->peer->refcnt);
1247
                                /* No usable neighbour for the new gateway:
                                 * drop the clone and keep the old route.
                                 */
1248                                if (arp_bind_neighbour(&rt->u.dst) ||
1249                                    !(rt->u.dst.neighbour->nud_state &
1250                                            NUD_VALID)) {
1251                                        if (rt->u.dst.neighbour)
1252                                                neigh_event_send(rt->u.dst.neighbour, NULL);
1253                                        ip_rt_put(rth);
1254                                        rt_drop(rt);
1255                                        goto do_next;
1256                                }
1257
1258                                netevent.old = &rth->u.dst;
1259                                netevent.new = &rt->u.dst;
1260                                call_netevent_notifiers(NETEVENT_REDIRECT,
1261                                                        &netevent);
1262
                                /* rt_del() releases the reference taken by
                                 * dst_hold() above and unlinks rth.
                                 */
1263                                rt_del(hash, rth);
                                /* On success, drop our own reference to the
                                 * entry returned through &rt.
                                 */
1264                                if (!rt_intern_hash(hash, rt, &rt))
1265                                        ip_rt_put(rt);
1266                                goto do_next;
1267                        }
1268                        rcu_read_unlock();
1269                do_next:
1270                        ;
1271                }
1272        }
1273        in_dev_put(in_dev);
1274        return;
1275
1276reject_redirect:
1277#ifdef CONFIG_IP_ROUTE_VERBOSE
1278        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1279                printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1280                        NIPQUAD_FMT " ignored.\n"
1281                        "  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1282                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1283                       NIPQUAD(saddr), NIPQUAD(daddr));
1284#endif
1285        in_dev_put(in_dev);
1286}
1287
1288static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1289{
1290        struct rtable *rt = (struct rtable *)dst;
1291        struct dst_entry *ret = dst;
1292
1293        if (rt) {
1294                if (dst->obsolete) {
1295                        ip_rt_put(rt);
1296                        ret = NULL;
1297                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1298                           rt->u.dst.expires) {
1299                        unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1300                                                rt->fl.oif);
1301#if RT_CACHE_DEBUG >= 1
1302                        printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1303                                          NIPQUAD_FMT "/%02x dropped\n",
1304                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1305#endif
1306                        rt_del(hash, rt);
1307                        ret = NULL;
1308                }
1309        }
1310        return ret;
1311}
1312
1313/*
1314 * Algorithm:
1315 *      1. The first ip_rt_redirect_number redirects are sent
1316 *         with exponential backoff, then we stop sending them at all,
1317 *         assuming that the host ignores our redirects.
1318 *      2. If we did not see packets requiring redirects
1319 *         during ip_rt_redirect_silence, we assume that the host
1320 *         forgot redirected route and start to send redirects again.
1321 *
1322 * This algorithm is much cheaper and more intelligent than dumb load limiting
1323 * in icmp.c.
1324 *
1325 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1326 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1327 */
1328
1329void ip_rt_send_redirect(struct sk_buff *skb)
1330{
1331        struct rtable *rt = skb->rtable;
1332        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1333
1334        if (!in_dev)
1335                return;
1336
1337        if (!IN_DEV_TX_REDIRECTS(in_dev))
1338                goto out;
1339
1340        /* No redirected packets during ip_rt_redirect_silence;
1341         * reset the algorithm.
1342         */
1343        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1344                rt->u.dst.rate_tokens = 0;
1345
1346        /* Too many ignored redirects; do not send anything
1347         * set u.dst.rate_last to the last seen redirected packet.
1348         */
1349        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1350                rt->u.dst.rate_last = jiffies;
1351                goto out;
1352        }
1353
1354        /* Check for load limit; set rate_last to the latest sent
1355         * redirect.
1356         */
1357        if (rt->u.dst.rate_tokens == 0 ||
1358            time_after(jiffies,
1359                       (rt->u.dst.rate_last +
1360                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1361                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1362                rt->u.dst.rate_last = jiffies;
1363                ++rt->u.dst.rate_tokens;
1364#ifdef CONFIG_IP_ROUTE_VERBOSE
1365                if (IN_DEV_LOG_MARTIANS(in_dev) &&
1366                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1367                    net_ratelimit())
1368                        printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1369                                "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1370                                NIPQUAD(rt->rt_src), rt->rt_iif,
1371                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1372#endif
1373        }
1374out:
1375        in_dev_put(in_dev);
1376}
1377
1378static int ip_error(struct sk_buff *skb)
1379{
1380        struct rtable *rt = skb->rtable;
1381        unsigned long now;
1382        int code;
1383
1384        switch (rt->u.dst.error) {
1385                case EINVAL:
1386                default:
1387                        goto out;
1388                case EHOSTUNREACH:
1389                        code = ICMP_HOST_UNREACH;
1390                        break;
1391                case ENETUNREACH:
1392                        code = ICMP_NET_UNREACH;
1393                        IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1394                        break;
1395                case EACCES:
1396                        code = ICMP_PKT_FILTERED;
1397                        break;
1398        }
1399
1400        now = jiffies;
1401        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1402        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1403                rt->u.dst.rate_tokens = ip_rt_error_burst;
1404        rt->u.dst.rate_last = now;
1405        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1406                rt->u.dst.rate_tokens -= ip_rt_error_cost;
1407                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1408        }
1409
1410out:    kfree_skb(skb);
1411        return 0;
1412}
1413
/*
 *      MTU plateau table, in descending order.
 *
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
        32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when @old_mtu
 * is not above any plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *step = mtu_plateau;
        const unsigned short *end = mtu_plateau +
                sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

        while (step < end) {
                if (*step < old_mtu)
                        return *step;
                step++;
        }
        return 68;
}
1431
1432unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1433                                 unsigned short new_mtu,
1434                                 struct net_device *dev)
1435{
1436        int i, k;
1437        unsigned short old_mtu = ntohs(iph->tot_len);
1438        struct rtable *rth;
1439        int  ikeys[2] = { dev->ifindex, 0 };
1440        __be32  skeys[2] = { iph->saddr, 0, };
1441        __be32  daddr = iph->daddr;
1442        unsigned short est_mtu = 0;
1443
1444        if (ipv4_config.no_pmtu_disc)
1445                return 0;
1446
1447        for (k = 0; k < 2; k++) {
1448                for (i = 0; i < 2; i++) {
1449                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1450
1451                        rcu_read_lock();
1452                        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1453                             rth = rcu_dereference(rth->u.dst.rt_next)) {
1454                                unsigned short mtu = new_mtu;
1455
1456                                if (rth->fl.fl4_dst != daddr ||
1457                                    rth->fl.fl4_src != skeys[i] ||
1458                                    rth->rt_dst != daddr ||
1459                                    rth->rt_src != iph->saddr ||
1460                                    rth->fl.oif != ikeys[k] ||
1461                                    rth->fl.iif != 0 ||
1462                                    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1463                                    !net_eq(dev_net(rth->u.dst.dev), net) ||
1464                                    rth->rt_genid != atomic_read(&rt_genid))
1465                                        continue;
1466
1467                                if (new_mtu < 68 || new_mtu >= old_mtu) {
1468
1469                                        /* BSD 4.2 compatibility hack :-( */
1470                                        if (mtu == 0 &&
1471                                            old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1472                                            old_mtu >= 68 + (iph->ihl << 2))
1473                                                old_mtu -= iph->ihl << 2;
1474
1475                                        mtu = guess_mtu(old_mtu);
1476                                }
1477                                if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1478                                        if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1479                                                dst_confirm(&rth->u.dst);
1480                                                if (mtu < ip_rt_min_pmtu) {
1481                                                        mtu = ip_rt_min_pmtu;
1482                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
1483                                                                (1 << RTAX_MTU);
1484                                                }
1485                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1486                                                dst_set_expires(&rth->u.dst,
1487                                                        ip_rt_mtu_expires);
1488                                        }
1489                                        est_mtu = mtu;
1490                                }
1491                        }
1492                        rcu_read_unlock();
1493                }
1494        }
1495        return est_mtu ? : new_mtu;
1496}
1497
1498static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1499{
1500        if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1501            !(dst_metric_locked(dst, RTAX_MTU))) {
1502                if (mtu < ip_rt_min_pmtu) {
1503                        mtu = ip_rt_min_pmtu;
1504                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1505                }
1506                dst->metrics[RTAX_MTU-1] = mtu;
1507                dst_set_expires(dst, ip_rt_mtu_expires);
1508                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1509        }
1510}
1511
/*
 * ipv4_dst_check - validity check for a cached IPv4 dst entry
 *
 * Always returns NULL; neither @dst nor @cookie is examined.
 * NOTE(review): presumably installed as the check hook of ipv4_dst_ops,
 * where NULL would mean "no longer valid" -- confirm against the dst_ops
 * definition, which is not visible here.
 */
1512static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1513{
1514        return NULL;
1515}
1516
1517static void ipv4_dst_destroy(struct dst_entry *dst)
1518{
1519        struct rtable *rt = (struct rtable *) dst;
1520        struct inet_peer *peer = rt->peer;
1521        struct in_device *idev = rt->idev;
1522
1523        if (peer) {
1524                rt->peer = NULL;
1525                inet_putpeer(peer);
1526        }
1527
1528        if (idev) {
1529                rt->idev = NULL;
1530                in_dev_put(idev);
1531        }
1532}
1533
1534static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1535                            int how)
1536{
1537        struct rtable *rt = (struct rtable *) dst;
1538        struct in_device *idev = rt->idev;
1539        if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1540                struct in_device *loopback_idev =
1541                        in_dev_get(dev_net(dev)->loopback_dev);
1542                if (loopback_idev) {
1543                        rt->idev = loopback_idev;
1544                        in_dev_put(idev);
1545                }
1546        }
1547}
1548
1549static void ipv4_link_failure(struct sk_buff *skb)
1550{
1551        struct rtable *rt;
1552
1553        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1554
1555        rt = skb->rtable;
1556        if (rt)
1557                dst_set_expires(&rt->u.dst, 0);
1558}
1559
1560static int ip_rt_bug(struct sk_buff *skb)
1561{
1562        printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1563                NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1564                skb->dev ? skb->dev->name : "?");
1565        kfree_skb(skb);
1566        return 0;
1567}
1568
1569/*
1570   We do not cache source address of outgoing interface,
1571   because it is used only by IP RR, TS and SRR options,
1572   so that it out of fast path.
1573
1574   BTW remember: "addr" is allowed to be not aligned
1575   in IP options!
1576 */
1577
1578void ip_rt_get_source(u8 *addr, struct rtable *rt)
1579{
1580        __be32 src;
1581        struct fib_result res;
1582
1583        if (rt->fl.iif == 0)
1584                src = rt->rt_src;
1585        else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1586                src = FIB_RES_PREFSRC(res);
1587                fib_res_put(&res);
1588        } else
1589                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1590                                        RT_SCOPE_UNIVERSE);
1591        memcpy(addr, &src, 4);
1592}
1593
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * set_class_tag - fill in the still-empty halves of the route's tclassid
 *
 * The low and high 16 bits are handled separately: a half is taken from
 * @tag only if that half of tclassid is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->u.dst.tclassid & low_half) == 0)
                rt->u.dst.tclassid |= tag & low_half;
        if ((rt->u.dst.tclassid & high_half) == 0)
                rt->u.dst.tclassid |= tag & high_half;
}
#endif
1603
1604static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1605{
1606        struct fib_info *fi = res->fi;
1607
1608        if (fi) {
1609                if (FIB_RES_GW(*res) &&
1610                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1611                        rt->rt_gateway = FIB_RES_GW(*res);
1612                memcpy(rt->u.dst.metrics, fi->fib_metrics,
1613                       sizeof(rt->u.dst.metrics));
1614                if (fi->fib_mtu == 0) {
1615                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1616                        if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1617                            rt->rt_gateway != rt->rt_dst &&
1618                            rt->u.dst.dev->mtu > 576)
1619                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
1620                }
1621#ifdef CONFIG_NET_CLS_ROUTE
1622                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1623#endif
1624        } else
1625                rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1626
1627        if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1628                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1629        if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1630                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1631        if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1632                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1633                                       ip_rt_min_advmss);
1634        if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1635                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1636
1637#ifdef CONFIG_NET_CLS_ROUTE
1638#ifdef CONFIG_IP_MULTIPLE_TABLES
1639        set_class_tag(rt, fib_rules_tclass(res));
1640#endif
1641        set_class_tag(rt, itag);
1642#endif
1643        rt->rt_type = res->type;
1644}
1645
1646static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1647                                u8 tos, struct net_device *dev, int our)
1648{
1649        unsigned hash;
1650        struct rtable *rth;
1651        __be32 spec_dst;
1652        struct in_device *in_dev = in_dev_get(dev);
1653        u32 itag = 0;
1654
1655        /* Primary sanity checks. */
1656
1657        if (in_dev == NULL)
1658                return -EINVAL;
1659
1660        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1661            ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1662                goto e_inval;
1663
1664        if (ipv4_is_zeronet(saddr)) {
1665                if (!ipv4_is_local_multicast(daddr))
1666                        goto e_inval;
1667                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1668        } else if (fib_validate_source(saddr, 0, tos, 0,
1669                                        dev, &spec_dst, &itag) < 0)
1670                goto e_inval;
1671
1672        rth = dst_alloc(&ipv4_dst_ops);
1673        if (!rth)
1674                goto e_nobufs;
1675
1676        rth->u.dst.output= ip_rt_bug;
1677
1678        atomic_set(&rth->u.dst.__refcnt, 1);
1679        rth->u.dst.flags= DST_HOST;
1680        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1681                rth->u.dst.flags |= DST_NOPOLICY;
1682        rth->fl.fl4_dst = daddr;
1683        rth->rt_dst     = daddr;
1684        rth->fl.fl4_tos = tos;
1685        rth->fl.mark    = skb->mark;
1686        rth->fl.fl4_src = saddr;
1687        rth->rt_src     = saddr;
1688#ifdef CONFIG_NET_CLS_ROUTE
1689        rth->u.dst.tclassid = itag;
1690#endif
1691        rth->rt_iif     =
1692        rth->fl.iif     = dev->ifindex;
1693        rth->u.dst.dev  = init_net.loopback_dev;
1694        dev_hold(rth->u.dst.dev);
1695        rth->idev       = in_dev_get(rth->u.dst.dev);
1696        rth->fl.oif     = 0;
1697        rth->rt_gateway = daddr;
1698        rth->rt_spec_dst= spec_dst;
1699        rth->rt_genid   = atomic_read(&rt_genid);
1700        rth->rt_flags   = RTCF_MULTICAST;
1701        rth->rt_type    = RTN_MULTICAST;
1702        if (our) {
1703                rth->u.dst.input= ip_local_deliver;
1704                rth->rt_flags |= RTCF_LOCAL;
1705        }
1706
1707#ifdef CONFIG_IP_MROUTE
1708        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1709                rth->u.dst.input = ip_mr_input;
1710#endif
1711        RT_CACHE_STAT_INC(in_slow_mc);
1712
1713        in_dev_put(in_dev);
1714        hash = rt_hash(daddr, saddr, dev->ifindex);
1715        return rt_intern_hash(hash, rth, &skb->rtable);
1716
1717e_nobufs:
1718        in_dev_put(in_dev);
1719        return -ENOBUFS;
1720
1721e_inval:
1722        in_dev_put(in_dev);
1723        return -EINVAL;
1724}
1725
1726
1727static void ip_handle_martian_source(struct net_device *dev,
1728                                     struct in_device *in_dev,
1729                                     struct sk_buff *skb,
1730                                     __be32 daddr,
1731                                     __be32 saddr)
1732{
1733        RT_CACHE_STAT_INC(in_martian_src);
1734#ifdef CONFIG_IP_ROUTE_VERBOSE
1735        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1736                /*
1737                 *      RFC1812 recommendation, if source is martian,
1738                 *      the only hint is MAC header.
1739                 */
1740                printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1741                        NIPQUAD_FMT", on dev %s\n",
1742                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1743                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1744                        int i;
1745                        const unsigned char *p = skb_mac_header(skb);
1746                        printk(KERN_WARNING "ll header: ");
1747                        for (i = 0; i < dev->hard_header_len; i++, p++) {
1748                                printk("%02x", *p);
1749                                if (i < (dev->hard_header_len - 1))
1750                                        printk(":");
1751                        }
1752                        printk("\n");
1753                }
1754        }
1755#endif
1756}
1757
1758static int __mkroute_input(struct sk_buff *skb,
1759                           struct fib_result *res,
1760                           struct in_device *in_dev,
1761                           __be32 daddr, __be32 saddr, u32 tos,
1762                           struct rtable **result)
1763{
1764
1765        struct rtable *rth;
1766        int err;
1767        struct in_device *out_dev;
1768        unsigned flags = 0;
1769        __be32 spec_dst;
1770        u32 itag;
1771
1772        /* get a working reference to the output device */
1773        out_dev = in_dev_get(FIB_RES_DEV(*res));
1774        if (out_dev == NULL) {
1775                if (net_ratelimit())
1776                        printk(KERN_CRIT "Bug in ip_route_input" \
1777                               "_slow(). Please, report\n");
1778                return -EINVAL;
1779        }
1780
1781
1782        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1783                                  in_dev->dev, &spec_dst, &itag);
1784        if (err < 0) {
1785                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1786                                         saddr);
1787
1788                err = -EINVAL;
1789                goto cleanup;
1790        }
1791
1792        if (err)
1793                flags |= RTCF_DIRECTSRC;
1794
1795        if (out_dev == in_dev && err &&
1796            (IN_DEV_SHARED_MEDIA(out_dev) ||
1797             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1798                flags |= RTCF_DOREDIRECT;
1799
1800        if (skb->protocol != htons(ETH_P_IP)) {
1801                /* Not IP (i.e. ARP). Do not create route, if it is
1802                 * invalid for proxy arp. DNAT routes are always valid.
1803                 */
1804                if (out_dev == in_dev) {
1805                        err = -EINVAL;
1806                        goto cleanup;
1807                }
1808        }
1809
1810
1811        rth = dst_alloc(&ipv4_dst_ops);
1812        if (!rth) {
1813                err = -ENOBUFS;
1814                goto cleanup;
1815        }
1816
1817        atomic_set(&rth->u.dst.__refcnt, 1);
1818        rth->u.dst.flags= DST_HOST;
1819        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1820                rth->u.dst.flags |= DST_NOPOLICY;
1821        if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1822                rth->u.dst.flags |= DST_NOXFRM;
1823        rth->fl.fl4_dst = daddr;
1824        rth->rt_dst     = daddr;
1825        rth->fl.fl4_tos = tos;
1826        rth->fl.mark    = skb->mark;
1827        rth->fl.fl4_src = saddr;
1828        rth->rt_src     = saddr;
1829        rth->rt_gateway = daddr;
1830        rth->rt_iif     =
1831                rth->fl.iif     = in_dev->dev->ifindex;
1832        rth->u.dst.dev  = (out_dev)->dev;
1833        dev_hold(rth->u.dst.dev);
1834        rth->idev       = in_dev_get(rth->u.dst.dev);
1835        rth->fl.oif     = 0;
1836        rth->rt_spec_dst= spec_dst;
1837
1838        rth->u.dst.input = ip_forward;
1839        rth->u.dst.output = ip_output;
1840        rth->rt_genid = atomic_read(&rt_genid);
1841
1842        rt_set_nexthop(rth, res, itag);
1843
1844        rth->rt_flags = flags;
1845
1846        *result = rth;
1847        err = 0;
1848 cleanup:
1849        /* release the working reference to the output device */
1850        in_dev_put(out_dev);
1851        return err;
1852}
1853
1854static int ip_mkroute_input(struct sk_buff *skb,
1855                            struct fib_result *res,
1856                            const struct flowi *fl,
1857                            struct in_device *in_dev,
1858                            __be32 daddr, __be32 saddr, u32 tos)
1859{
1860        struct rtable* rth = NULL;
1861        int err;
1862        unsigned hash;
1863
1864#ifdef CONFIG_IP_ROUTE_MULTIPATH
1865        if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1866                fib_select_multipath(fl, res);
1867#endif
1868
1869        /* create a routing cache entry */
1870        err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1871        if (err)
1872                return err;
1873
1874        /* put it into the cache */
1875        hash = rt_hash(daddr, saddr, fl->iif);
1876        return rt_intern_hash(hash, rth, &skb->rtable);
1877}
1878
1879/*
1880 *      NOTE. We drop all packets that have local source
1881 *      addresses, because every properly looped back packet
1882 *      must have a correct destination already attached by the output routine.
1883 *
1884 *      This approach solves two big problems:
1885 *      1. Non-simplex devices are handled properly.
1886 *      2. IP spoofing attempts are filtered with a 100% guarantee.
1887 */
1888
1889static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1890                               u8 tos, struct net_device *dev)
1891{
            /*
             * Input route resolution without the route cache; ip_route_input()
             * calls this after a cache miss on a non-multicast destination.
             *
             * Rejects martian source/destination addresses, does the FIB
             * lookup, and then either hands forwarding routes to
             * ip_mkroute_input() or builds a local / broadcast / unreachable
             * cache entry itself and installs it with rt_intern_hash().
             *
             * Returns 0 on success or a negative errno: -EINVAL (no in_device,
             * martian source, non-IP broadcast), -EHOSTUNREACH, -ENOBUFS, or
             * whatever ip_mkroute_input() / rt_intern_hash() report.
             */
1892        struct fib_result res;
1893        struct in_device *in_dev = in_dev_get(dev);
1894        struct flowi fl = { .nl_u = { .ip4_u =
1895                                      { .daddr = daddr,
1896                                        .saddr = saddr,
1897                                        .tos = tos,
1898                                        .scope = RT_SCOPE_UNIVERSE,
1899                                      } },
1900                            .mark = skb->mark,
1901                            .iif = dev->ifindex };
1902        unsigned        flags = 0;
1903        u32             itag = 0;
1904        struct rtable * rth;
1905        unsigned        hash;
1906        __be32          spec_dst;
1907        int             err = -EINVAL;
1908        int             free_res = 0;
1909        struct net    * net = dev_net(dev);
1910
1911        /* IP on this device is disabled. */
1912
1913        if (!in_dev)
1914                goto out;
1915
1916        /* Check for the most weird martians, which can be not detected
1917           by fib_lookup.
1918         */
1919
1920        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1921            ipv4_is_loopback(saddr))
1922                goto martian_source;
1923
            /* Limited broadcast, or all-zero source and destination. */
1924        if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1925                goto brd_input;
1926
1927        /* Accept zero addresses only to limited broadcast;
1928         * I even do not know to fix it or not. Waiting for complains :-)
1929         */
1930        if (ipv4_is_zeronet(saddr))
1931                goto martian_source;
1932
1933        if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1934            ipv4_is_loopback(daddr))
1935                goto martian_destination;
1936
1937        /*
1938         *      Now we are ready to route packet.
1939         */
1940        if ((err = fib_lookup(net, &fl, &res)) != 0) {
1941                if (!IN_DEV_FORWARD(in_dev))
1942                        goto e_hostunreach;
1943                goto no_route;
1944        }
            /* From here on res must be released with fib_res_put() at exit. */
1945        free_res = 1;
1946
1947        RT_CACHE_STAT_INC(in_slow_tot);
1948
1949        if (res.type == RTN_BROADCAST)
1950                goto brd_input;
1951
1952        if (res.type == RTN_LOCAL) {
1953                int result;
1954                result = fib_validate_source(saddr, daddr, tos,
1955                                             net->loopback_dev->ifindex,
1956                                             dev, &spec_dst, &itag);
1957                if (result < 0)
1958                        goto martian_source;
1959                if (result)
1960                        flags |= RTCF_DIRECTSRC;
                    /* Replaces whatever fib_validate_source() stored in spec_dst. */
1961                spec_dst = daddr;
1962                goto local_input;
1963        }
1964
            /* Everything below is forwarding, which must be enabled on in_dev. */
1965        if (!IN_DEV_FORWARD(in_dev))
1966                goto e_hostunreach;
1967        if (res.type != RTN_UNICAST)
1968                goto martian_destination;
1969
1970        err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
        /* Common exit: drop the in_device reference and, if held, the FIB result. */
1971done:
1972        in_dev_put(in_dev);
1973        if (free_res)
1974                fib_res_put(&res);
1975out:    return err;
1976
        /* Broadcast input: only IP frames are accepted. */
1977brd_input:
1978        if (skb->protocol != htons(ETH_P_IP))
1979                goto e_inval;
1980
1981        if (ipv4_is_zeronet(saddr))
1982                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1983        else {
1984                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1985                                          &itag);
1986                if (err < 0)
1987                        goto martian_source;
1988                if (err)
1989                        flags |= RTCF_DIRECTSRC;
1990        }
1991        flags |= RTCF_BROADCAST;
1992        res.type = RTN_BROADCAST;
1993        RT_CACHE_STAT_INC(in_brd);
1994
        /*
         * Build the cache entry here; reached for local destinations, by
         * fall-through for broadcasts, and from no_route for unreachable
         * destinations.  The entry is bound to the namespace's loopback device.
         */
1995local_input:
1996        rth = dst_alloc(&ipv4_dst_ops);
1997        if (!rth)
1998                goto e_nobufs;
1999
2000        rth->u.dst.output= ip_rt_bug;
2001        rth->rt_genid = atomic_read(&rt_genid);
2002
2003        atomic_set(&rth->u.dst.__refcnt, 1);
2004        rth->u.dst.flags= DST_HOST;
2005        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2006                rth->u.dst.flags |= DST_NOPOLICY;
2007        rth->fl.fl4_dst = daddr;
2008        rth->rt_dst     = daddr;
2009        rth->fl.fl4_tos = tos;
2010        rth->fl.mark    = skb->mark;
2011        rth->fl.fl4_src = saddr;
2012        rth->rt_src     = saddr;
2013#ifdef CONFIG_NET_CLS_ROUTE
2014        rth->u.dst.tclassid = itag;
2015#endif
2016        rth->rt_iif     =
2017        rth->fl.iif     = dev->ifindex;
2018        rth->u.dst.dev  = net->loopback_dev;
2019        dev_hold(rth->u.dst.dev);
2020        rth->idev       = in_dev_get(rth->u.dst.dev);
2021        rth->rt_gateway = daddr;
2022        rth->rt_spec_dst= spec_dst;
2023        rth->u.dst.input= ip_local_deliver;
2024        rth->rt_flags   = flags|RTCF_LOCAL;
            /*
             * no_route case: input goes to ip_error() with the positive errno
             * recorded in dst.error, and the entry is not marked local.
             */
2025        if (res.type == RTN_UNREACHABLE) {
2026                rth->u.dst.input= ip_error;
2027                rth->u.dst.error= -err;
2028                rth->rt_flags   &= ~RTCF_LOCAL;
2029        }
2030        rth->rt_type    = res.type;
2031        hash = rt_hash(daddr, saddr, fl.iif);
2032        err = rt_intern_hash(hash, rth, &skb->rtable);
2033        goto done;
2034
        /*
         * FIB lookup failed while forwarding is enabled on in_dev: cache an
         * unreachable entry (built at local_input) instead of failing outright.
         */
2035no_route:
2036        RT_CACHE_STAT_INC(in_no_route);
2037        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2038        res.type = RTN_UNREACHABLE;
2039        if (err == -ESRCH)
2040                err = -ENETUNREACH;
2041        goto local_input;
2042
2043        /*
2044         *      Do not cache martian addresses: they should be logged (RFC1812)
2045         */
2046martian_destination:
2047        RT_CACHE_STAT_INC(in_martian_dst);
2048#ifdef CONFIG_IP_ROUTE_VERBOSE
2049        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2050                printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2051                        NIPQUAD_FMT ", dev %s\n",
2052                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2053#endif
2054
        /* martian_destination falls through to here. */
2055e_hostunreach:
2056        err = -EHOSTUNREACH;
2057        goto done;
2058
2059e_inval:
2060        err = -EINVAL;
2061        goto done;
2062
2063e_nobufs:
2064        err = -ENOBUFS;
2065        goto done;
2066
2067martian_source:
2068        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2069        goto e_inval;
2070}
2071
2072int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2073                   u8 tos, struct net_device *dev)
2074{
2075        struct rtable * rth;
2076        unsigned        hash;
2077        int iif = dev->ifindex;
2078        struct net *net;
2079
2080        net = dev_net(dev);
2081        tos &= IPTOS_RT_MASK;
2082        hash = rt_hash(daddr, saddr, iif);
2083
2084        rcu_read_lock();
2085        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2086             rth = rcu_dereference(rth->u.dst.rt_next)) {
2087                if (((rth->fl.fl4_dst ^ daddr) |
2088                     (rth->fl.fl4_src ^ saddr) |
2089                     (rth->fl.iif ^ iif) |
2090                     rth->fl.oif |
2091                     (rth->fl.fl4_tos ^ tos)) == 0 &&
2092                    rth->fl.mark == skb->mark &&
2093                    net_eq(dev_net(rth->u.dst.dev), net) &&
2094                    rth->rt_genid == atomic_read(&rt_genid)) {
2095                        dst_use(&rth->u.dst, jiffies);
2096                        RT_CACHE_STAT_INC(in_hit);
2097                        rcu_read_unlock();
2098                        skb->rtable = rth;
2099                        return 0;
2100                }
2101                RT_CACHE_STAT_INC(in_hlist_search);
2102        }
2103        rcu_read_unlock();
2104
2105        /* Multicast recognition logic is moved from route cache to here.
2106           The problem was that too many Ethernet cards have broken/missing
2107           hardware multicast filters :-( As result the host on multicasting
2108           network acquires a lot of useless route cache entries, sort of
2109           SDR messages from all the world. Now we try to get rid of them.
2110           Really, provided software IP multicast filter is organized
2111           reasonably (at least, hashed), it does not result in a slowdown
2112           comparing with route cache reject entries.
2113           Note, that multicast routers are not affected, because
2114           route cache entry is created eventually.
2115         */
2116        if (ipv4_is_multicast(daddr)) {
2117                struct in_device *in_dev;
2118
2119                rcu_read_lock();
2120                if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2121                        int our = ip_check_mc(in_dev, daddr, saddr,
2122                                ip_hdr(skb)->protocol);
2123                        if (our
2124#ifdef CONFIG_IP_MROUTE
2125                            || (!ipv4_is_local_multicast(daddr) &&
2126                                IN_DEV_MFORWARD(in_dev))
2127#endif
2128                            ) {
2129                                rcu_read_unlock();
2130                                return ip_route_input_mc(skb, daddr, saddr,
2131                                                         tos, dev, our);
2132                        }
2133                }
2134                rcu_read_unlock();
2135                return -EINVAL;
2136        }
2137        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2138}
2139
2140static int __mkroute_output(struct rtable **result,
2141                            struct fib_result *res,
2142                            const struct flowi *fl,
2143                            const struct flowi *oldflp,
2144                            struct net_device *dev_out,
2145                            unsigned flags)
2146{
2147        struct rtable *rth;
2148        struct in_device *in_dev;
2149        u32 tos = RT_FL_TOS(oldflp);
2150        int err = 0;
2151
2152        if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2153                return -EINVAL;
2154
2155        if (fl->fl4_dst == htonl(0xFFFFFFFF))
2156                res->type = RTN_BROADCAST;
2157        else if (ipv4_is_multicast(fl->fl4_dst))
2158                res->type = RTN_MULTICAST;
2159        else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2160                return -EINVAL;
2161
2162        if (dev_out->flags & IFF_LOOPBACK)
2163                flags |= RTCF_LOCAL;
2164
2165        /* get work reference to inet device */
2166        in_dev = in_dev_get(dev_out);
2167        if (!in_dev)
2168                return -EINVAL;
2169
2170        if (res->type == RTN_BROADCAST) {
2171                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2172                if (res->fi) {
2173                        fib_info_put(res->fi);
2174                        res->fi = NULL;
2175                }
2176        } else if (res->type == RTN_MULTICAST) {
2177                flags |= RTCF_MULTICAST|RTCF_LOCAL;
2178                if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2179                                 oldflp->proto))
2180                        flags &= ~RTCF_LOCAL;
2181                /* If multicast route do not exist use
2182                   default one, but do not gateway in this case.
2183                   Yes, it is hack.
2184                 */
2185                if (res->fi && res->prefixlen < 4) {
2186                        fib_info_put(res->fi);
2187                        res->fi = NULL;
2188                }
2189        }
2190
2191
2192        rth = dst_alloc(&ipv4_dst_ops);
2193        if (!rth) {
2194                err = -ENOBUFS;
2195                goto cleanup;
2196        }
2197
2198        atomic_set(&rth->u.dst.__refcnt, 1);
2199        rth->u.dst.flags= DST_HOST;
2200        if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2201                rth->u.dst.flags |= DST_NOXFRM;
2202        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2203                rth->u.dst.flags |= DST_NOPOLICY;
2204
2205        rth->fl.fl4_dst = oldflp->fl4_dst;
2206        rth->fl.fl4_tos = tos;
2207        rth->fl.fl4_src = oldflp->fl4_src;
2208        rth->fl.oif     = oldflp->oif;
2209        rth->fl.mark    = oldflp->mark;
2210        rth->rt_dst     = fl->fl4_dst;
2211        rth->rt_src     = fl->fl4_src;
2212        rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2213        /* get references to the devices that are to be hold by the routing
2214           cache entry */
2215        rth->u.dst.dev  = dev_out;
2216        dev_hold(dev_out);
2217        rth->idev       = in_dev_get(dev_out);
2218        rth->rt_gateway = fl->fl4_dst;
2219        rth->rt_spec_dst= fl->fl4_src;
2220
2221        rth->u.dst.output=ip_output;
2222        rth->rt_genid = atomic_read(&rt_genid);
2223
2224        RT_CACHE_STAT_INC(out_slow_tot);
2225
2226        if (flags & RTCF_LOCAL) {
2227                rth->u.dst.input = ip_local_deliver;
2228                rth->rt_spec_dst = fl->fl4_dst;
2229        }
2230        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2231                rth->rt_spec_dst = fl->fl4_src;
2232                if (flags & RTCF_LOCAL &&
2233                    !(dev_out->flags & IFF_LOOPBACK)) {
2234                        rth->u.dst.output = ip_mc_output;
2235                        RT_CACHE_STAT_INC(out_slow_mc);
2236                }
2237#ifdef CONFIG_IP_MROUTE
2238                if (res->type == RTN_MULTICAST) {
2239                        if (IN_DEV_MFORWARD(in_dev) &&
2240                            !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2241                                rth->u.dst.input = ip_mr_input;
2242                                rth->u.dst.output = ip_mc_output;
2243                        }
2244                }
2245#endif
2246        }
2247
2248        rt_set_nexthop(rth, res, 0);
2249
2250        rth->rt_flags = flags;
2251
2252        *result = rth;
2253 cleanup:
2254        /* release work reference to inet device */
2255        in_dev_put(in_dev);
2256
2257        return err;
2258}
2259
2260static int ip_mkroute_output(struct rtable **rp,
2261                             struct fib_result *res,
2262                             const struct flowi *fl,
2263                             const struct flowi *oldflp,
2264                             struct net_device *dev_out,
2265                             unsigned flags)
2266{
2267        struct rtable *rth = NULL;
2268        int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2269        unsigned hash;
2270        if (err == 0) {
2271                hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2272                err = rt_intern_hash(hash, rth, rp);
2273        }
2274
2275        return err;
2276}
2277
2278/*
2279 * Major route resolver routine.
2280 */
2281
2282static int ip_route_output_slow(struct net *net, struct rtable **rp,
2283                                const struct flowi *oldflp)
2284{
            /*
             * Resolve an output route from the FIB; this function itself does
             * no route cache lookup.
             *
             * @net:    namespace used for device and FIB lookups.
             * @rp:     handed through to ip_mkroute_output().
             * @oldflp: the caller's flow; never modified.  A working copy (fl)
             *          receives the chosen source address and output interface.
             *
             * Returns 0 on success or a negative errno: -EINVAL (bad or
             * non-local source address), -ENODEV (unknown oif or device
             * without in_device), -ENETUNREACH (no route and no oif), or
             * whatever ip_mkroute_output() reports.
             */
2285        u32 tos = RT_FL_TOS(oldflp);
2286        struct flowi fl = { .nl_u = { .ip4_u =
2287                                      { .daddr = oldflp->fl4_dst,
2288                                        .saddr = oldflp->fl4_src,
2289                                        .tos = tos & IPTOS_RT_MASK,
2290                                        .scope = ((tos & RTO_ONLINK) ?
2291                                                  RT_SCOPE_LINK :
2292                                                  RT_SCOPE_UNIVERSE),
2293                                      } },
2294                            .mark = oldflp->mark,
2295                            .iif = net->loopback_dev->ifindex,
2296                            .oif = oldflp->oif };
2297        struct fib_result res;
2298        unsigned flags = 0;
2299        struct net_device *dev_out = NULL;
2300        int free_res = 0;
2301        int err;
2302
2303
            /*
             * Pre-clear: several paths below jump to make_route without a
             * FIB lookup and must not pass a stale fib_info pointer on.
             */
2304        res.fi          = NULL;
2305#ifdef CONFIG_IP_MULTIPLE_TABLES
2306        res.r           = NULL;
2307#endif
2308
            /* Caller supplied a source address: validate it first. */
2309        if (oldflp->fl4_src) {
2310                err = -EINVAL;
2311                if (ipv4_is_multicast(oldflp->fl4_src) ||
2312                    ipv4_is_lbcast(oldflp->fl4_src) ||
2313                    ipv4_is_zeronet(oldflp->fl4_src))
2314                        goto out;
2315
2316                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2317                dev_out = ip_dev_find(net, oldflp->fl4_src);
2318                if (dev_out == NULL)
2319                        goto out;
2320
2321                /* I removed check for oif == dev_out->oif here.
2322                   It was wrong for two reasons:
2323                   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2324                      is assigned to multiple interfaces.
2325                   2. Moreover, we are allowed to send packets with saddr
2326                      of another iface. --ANK
2327                 */
2328
2329                if (oldflp->oif == 0
2330                    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2331                        oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2332                        /* Special hack: user can direct multicasts
2333                           and limited broadcast via necessary interface
2334                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2335                           This hack is not just for fun, it allows
2336                           vic,vat and friends to work.
2337                           They bind socket to loopback, set ttl to zero
2338                           and expect that it will work.
2339                           From the viewpoint of routing cache they are broken,
2340                           because we are not allowed to build multicast path
2341                           with loopback source addr (look, routing cache
2342                           cannot know, that ttl is zero, so that packet
2343                           will not leave this host and route is valid).
2344                           Luckily, this hack is good workaround.
2345                         */
2346
2347                        fl.oif = dev_out->ifindex;
2348                        goto make_route;
2349                }
                    /* The device was only needed to validate the source; release it. */
2350                if (dev_out)
2351                        dev_put(dev_out);
2352                dev_out = NULL;
2353        }
2354
2355
            /* Caller pinned an output interface. */
2356        if (oldflp->oif) {
2357                dev_out = dev_get_by_index(net, oldflp->oif);
2358                err = -ENODEV;
2359                if (dev_out == NULL)
2360                        goto out;
2361
2362                /* RACE: Check return value of inet_select_addr instead. */
2363                if (__in_dev_get_rtnl(dev_out) == NULL) {
2364                        dev_put(dev_out);
2365                        goto out;       /* Wrong error code */
2366                }
2367
                    /* Link-local multicast and limited broadcast need no FIB lookup. */
2368                if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2369                    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2370                        if (!fl.fl4_src)
2371                                fl.fl4_src = inet_select_addr(dev_out, 0,
2372                                                              RT_SCOPE_LINK);
2373                        goto make_route;
2374                }
2375                if (!fl.fl4_src) {
2376                        if (ipv4_is_multicast(oldflp->fl4_dst))
2377                                fl.fl4_src = inet_select_addr(dev_out, 0,
2378                                                              fl.fl4_scope);
2379                        else if (!oldflp->fl4_dst)
2380                                fl.fl4_src = inet_select_addr(dev_out, 0,
2381                                                              RT_SCOPE_HOST);
2382                }
2383        }
2384
            /*
             * No destination: route to ourselves over the loopback device,
             * defaulting both addresses to INADDR_LOOPBACK if no source is known.
             */
2385        if (!fl.fl4_dst) {
2386                fl.fl4_dst = fl.fl4_src;
2387                if (!fl.fl4_dst)
2388                        fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2389                if (dev_out)
2390                        dev_put(dev_out);
2391                dev_out = net->loopback_dev;
2392                dev_hold(dev_out);
2393                fl.oif = net->loopback_dev->ifindex;
2394                res.type = RTN_LOCAL;
2395                flags |= RTCF_LOCAL;
2396                goto make_route;
2397        }
2398
2399        if (fib_lookup(net, &fl, &res)) {
2400                res.fi = NULL;
2401                if (oldflp->oif) {
2402                        /* Apparently, routing tables are wrong. Assume,
2403                           that the destination is on link.
2404
2405                           WHY? DW.
2406                           Because we are allowed to send to iface
2407                           even if it has NO routes and NO assigned
2408                           addresses. When oif is specified, routing
2409                           tables are looked up with only one purpose:
2410                           to catch if destination is gatewayed, rather than
2411                           direct. Moreover, if MSG_DONTROUTE is set,
2412                           we send packet, ignoring both routing tables
2413                           and ifaddr state. --ANK
2414
2415
2416                           We could make it even if oif is unknown,
2417                           likely IPv6, but we do not.
2418                         */
2419
2420                        if (fl.fl4_src == 0)
2421                                fl.fl4_src = inet_select_addr(dev_out, 0,
2422                                                              RT_SCOPE_LINK);
2423                        res.type = RTN_UNICAST;
2424                        goto make_route;
2425                }
2426                if (dev_out)
2427                        dev_put(dev_out);
2428                err = -ENETUNREACH;
2429                goto out;
2430        }
            /* Lookup succeeded: res is released with fib_res_put() at exit. */
2431        free_res = 1;
2432
            /* Local destination: always leave through loopback, without fib_info. */
2433        if (res.type == RTN_LOCAL) {
2434                if (!fl.fl4_src)
2435                        fl.fl4_src = fl.fl4_dst;
2436                if (dev_out)
2437                        dev_put(dev_out);
2438                dev_out = net->loopback_dev;
2439                dev_hold(dev_out);
2440                fl.oif = dev_out->ifindex;
2441                if (res.fi)
2442                        fib_info_put(res.fi);
2443                res.fi = NULL;
2444                flags |= RTCF_LOCAL;
2445                goto make_route;
2446        }
2447
2448#ifdef CONFIG_IP_ROUTE_MULTIPATH
2449        if (res.fi->fib_nhs > 1 && fl.oif == 0)
2450                fib_select_multipath(&fl, &res);
2451        else
2452#endif
2453        if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2454                fib_select_default(net, &fl, &res);
2455
2456        if (!fl.fl4_src)
2457                fl.fl4_src = FIB_RES_PREFSRC(res);
2458
            /* Switch to the device chosen by the FIB result. */
2459        if (dev_out)
2460                dev_put(dev_out);
2461        dev_out = FIB_RES_DEV(res);
2462        dev_hold(dev_out);
2463        fl.oif = dev_out->ifindex;
2464
2465
        /* Every path reaching this label has set dev_out; its reference is dropped below. */
2466make_route:
2467        err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2468
2469
2470        if (free_res)
2471                fib_res_put(&res);
2472        if (dev_out)
2473                dev_put(dev_out);
2474out:    return err;
2475}
2476
2477int __ip_route_output_key(struct net *net, struct rtable **rp,
2478                          const struct flowi *flp)
2479{
2480        unsigned hash;
2481        struct rtable *rth;
2482
2483        hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2484
2485        rcu_read_lock_bh();
2486        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2487                rth = rcu_dereference(rth->u.dst.rt_next)) {
2488                if (rth->fl.fl4_dst == flp->fl4_dst &&
2489                    rth->fl.fl4_src == flp->fl4_src &&
2490                    rth->fl.iif == 0 &&
2491                    rth->fl.oif == flp->oif &&
2492                    rth->fl.mark == flp->mark &&
2493                    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2494                            (IPTOS_RT_MASK | RTO_ONLINK)) &&
2495                    net_eq(dev_net(rth->u.dst.dev), net) &&
2496                    rth->rt_genid == atomic_read(&rt_genid)) {
2497                        dst_use(&rth->u.dst, jiffies);
2498                        RT_CACHE_STAT_INC(out_hit);
2499                        rcu_read_unlock_bh();
2500                        *rp = rth;
2501                        return 0;
2502                }
2503                RT_CACHE_STAT_INC(out_hlist_search);
2504        }
2505        rcu_read_unlock_bh();
2506
2507        return ip_route_output_slow(net, rp, flp);
2508}
2509
2510EXPORT_SYMBOL_GPL(__ip_route_output_key);
2511
2512static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2513{
2514}
2515
2516static struct dst_ops ipv4_dst_blackhole_ops = {
2517        .family                 =       AF_INET,
2518        .protocol               =       __constant_htons(ETH_P_IP),
2519        .destroy                =       ipv4_dst_destroy,
2520        .check                  =       ipv4_dst_check,
2521        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2522        .entry_size             =       sizeof(struct rtable),
2523        .entries                =       ATOMIC_INIT(0),
2524};
2525
2526
2527static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2528{
2529        struct rtable *ort = *rp;
2530        struct rtable *rt = (struct rtable *)
2531                dst_alloc(&ipv4_dst_blackhole_ops);
2532
2533        if (rt) {
2534                struct dst_entry *new = &rt->u.dst;
2535
2536                atomic_set(&new->__refcnt, 1);
2537                new->__use = 1;
2538                new->input = dst_discard;
2539                new->output = dst_discard;
2540                memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2541
2542                new->dev = ort->u.dst.dev;
2543                if (new->dev)
2544                        dev_hold(new->dev);
2545
2546                rt->fl = ort->fl;
2547
2548                rt->idev = ort->idev;
2549                if (rt->idev)
2550                        in_dev_hold(rt->idev);
2551                rt->rt_genid = atomic_read(&rt_genid);
2552                rt->rt_flags = ort->rt_flags;
2553                rt->rt_type = ort->rt_type;
2554                rt->rt_dst = ort->rt_dst;
2555                rt->rt_src = ort->rt_src;
2556                rt->rt_iif = ort->rt_iif;
2557                rt->rt_gateway = ort->rt_gateway;
2558                rt->rt_spec_dst = ort->rt_spec_dst;
2559                rt->peer = ort->peer;
2560                if (rt->peer)
2561                        atomic_inc(&rt->peer->refcnt);
2562
2563                dst_free(new);
2564        }
2565
2566        dst_release(&(*rp)->u.dst);
2567        *rp = rt;
2568        return (rt ? 0 : -ENOMEM);
2569}
2570
2571int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2572                         struct sock *sk, int flags)
2573{
2574        int err;
2575
2576        if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2577                return err;
2578
2579        if (flp->proto) {
2580                if (!flp->fl4_src)
2581                        flp->fl4_src = (*rp)->rt_src;
2582                if (!flp->fl4_dst)
2583                        flp->fl4_dst = (*rp)->rt_dst;
2584                err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2585                                    flags ? XFRM_LOOKUP_WAIT : 0);
2586                if (err == -EREMOTE)
2587                        err = ipv4_dst_blackhole(rp, flp);
2588
2589                return err;
2590        }
2591
2592        return 0;
2593}
2594
2595EXPORT_SYMBOL_GPL(ip_route_output_flow);
2596
2597int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2598{
2599        return ip_route_output_flow(net, rp, flp, NULL, 0);
2600}
2601
2602static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2603                        int nowait, unsigned int flags)
2604{
2605        struct rtable *rt = skb->rtable;
2606        struct rtmsg *r;
2607        struct nlmsghdr *nlh;
2608        long expires;
2609        u32 id = 0, ts = 0, tsage = 0, error;
2610
2611        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2612        if (nlh == NULL)
2613                return -EMSGSIZE;
2614
2615        r = nlmsg_data(nlh);
2616        r->rtm_family    = AF_INET;
2617        r->rtm_dst_len  = 32;
2618        r->rtm_src_len  = 0;
2619        r->rtm_tos      = rt->fl.fl4_tos;
2620        r->rtm_table    = RT_TABLE_MAIN;
2621        NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2622        r->rtm_type     = rt->rt_type;
2623        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2624        r->rtm_protocol = RTPROT_UNSPEC;
2625        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2626        if (rt->rt_flags & RTCF_NOTIFY)
2627                r->rtm_flags |= RTM_F_NOTIFY;
2628
2629        NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2630
2631        if (rt->fl.fl4_src) {
2632                r->rtm_src_len = 32;
2633                NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2634        }
2635        if (rt->u.dst.dev)
2636                NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2637#ifdef CONFIG_NET_CLS_ROUTE
2638        if (rt->u.dst.tclassid)
2639                NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2640#endif
2641        if (rt->fl.iif)
2642                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2643        else if (rt->rt_src != rt->fl.fl4_src)
2644                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2645
2646        if (rt->rt_dst != rt->rt_gateway)
2647                NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2648
2649        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2650                goto nla_put_failure;
2651
2652        error = rt->u.dst.error;
2653        expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2654        if (rt->peer) {
2655                id = rt->peer->ip_id_count;
2656                if (rt->peer->tcp_ts_stamp) {
2657                        ts = rt->peer->tcp_ts;
2658                        tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2659                }
2660        }
2661
2662        if (rt->fl.iif) {
2663#ifdef CONFIG_IP_MROUTE
2664                __be32 dst = rt->rt_dst;
2665
2666                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2667                    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2668                        int err = ipmr_get_route(skb, r, nowait);
2669                        if (err <= 0) {
2670                                if (!nowait) {
2671                                        if (err == 0)
2672                                                return 0;
2673                                        goto nla_put_failure;
2674                                } else {
2675                                        if (err == -EMSGSIZE)
2676                                                goto nla_put_failure;
2677                                        error = err;
2678                                }
2679                        }
2680                } else
2681#endif
2682                        NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2683        }
2684
2685        if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2686                               expires, error) < 0)
2687                goto nla_put_failure;
2688
2689        return nlmsg_end(skb, nlh);
2690
2691nla_put_failure:
2692        nlmsg_cancel(skb, nlh);
2693        return -EMSGSIZE;
2694}
2695
2696static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2697{
2698        struct net *net = sock_net(in_skb->sk);
2699        struct rtmsg *rtm;
2700        struct nlattr *tb[RTA_MAX+1];
2701        struct rtable *rt = NULL;
2702        __be32 dst = 0;
2703        __be32 src = 0;
2704        u32 iif;
2705        int err;
2706        struct sk_buff *skb;
2707
2708        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2709        if (err < 0)
2710                goto errout;
2711
2712        rtm = nlmsg_data(nlh);
2713
2714        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2715        if (skb == NULL) {
2716                err = -ENOBUFS;
2717                goto errout;
2718        }
2719
2720        /* Reserve room for dummy headers, this skb can pass
2721           through good chunk of routing engine.
2722         */
2723        skb_reset_mac_header(skb);
2724        skb_reset_network_header(skb);
2725
2726        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2727        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2728        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2729
2730        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2731        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2732        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2733
2734        if (iif) {
2735                struct net_device *dev;
2736
2737                dev = __dev_get_by_index(net, iif);
2738                if (dev == NULL) {
2739                        err = -ENODEV;
2740                        goto errout_free;
2741                }
2742
2743                skb->protocol   = htons(ETH_P_IP);
2744                skb->dev        = dev;
2745                local_bh_disable();
2746                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2747                local_bh_enable();
2748
2749                rt = skb->rtable;
2750                if (err == 0 && rt->u.dst.error)
2751                        err = -rt->u.dst.error;
2752        } else {
2753                struct flowi fl = {
2754                        .nl_u = {
2755                                .ip4_u = {
2756                                        .daddr = dst,
2757                                        .saddr = src,
2758                                        .tos = rtm->rtm_tos,
2759                                },
2760                        },
2761                        .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2762                };
2763                err = ip_route_output_key(net, &rt, &fl);
2764        }
2765
2766        if (err)
2767                goto errout_free;
2768
2769        skb->rtable = rt;
2770        if (rtm->rtm_flags & RTM_F_NOTIFY)
2771                rt->rt_flags |= RTCF_NOTIFY;
2772
2773        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2774                           RTM_NEWROUTE, 0, 0);
2775        if (err <= 0)
2776                goto errout_free;
2777
2778        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2779errout:
2780        return err;
2781
2782errout_free:
2783        kfree_skb(skb);
2784        goto errout;
2785}
2786
2787int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2788{
2789        struct rtable *rt;
2790        int h, s_h;
2791        int idx, s_idx;
2792        struct net *net;
2793
2794        net = sock_net(skb->sk);
2795
2796        s_h = cb->args[0];
2797        if (s_h < 0)
2798                s_h = 0;
2799        s_idx = idx = cb->args[1];
2800        for (h = s_h; h <= rt_hash_mask; h++) {
2801                rcu_read_lock_bh();
2802                for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2803                     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2804                        if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2805                                continue;
2806                        if (rt->rt_genid != atomic_read(&rt_genid))
2807                                continue;
2808                        skb->dst = dst_clone(&rt->u.dst);
2809                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2810                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2811                                         1, NLM_F_MULTI) <= 0) {
2812                                dst_release(xchg(&skb->dst, NULL));
2813                                rcu_read_unlock_bh();
2814                                goto done;
2815                        }
2816                        dst_release(xchg(&skb->dst, NULL));
2817                }
2818                rcu_read_unlock_bh();
2819                s_idx = 0;
2820        }
2821
2822done:
2823        cb->args[0] = h;
2824        cb->args[1] = idx;
2825        return skb->len;
2826}
2827
/*
 * Multicast event hook: requests a routing cache flush with a delay
 * argument of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        const int delay = 0;

        rt_cache_flush(delay);
}
2832
2833#ifdef CONFIG_SYSCTL
2834static int flush_delay;
2835
2836static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2837                                        struct file *filp, void __user *buffer,
2838                                        size_t *lenp, loff_t *ppos)
2839{
2840        if (write) {
2841                proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2842                rt_cache_flush(flush_delay);
2843                return 0;
2844        }
2845
2846        return -EINVAL;
2847}
2848
2849static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2850                                                int __user *name,
2851                                                int nlen,
2852                                                void __user *oldval,
2853                                                size_t __user *oldlenp,
2854                                                void __user *newval,
2855                                                size_t newlen)
2856{
2857        int delay;
2858        if (newlen != sizeof(int))
2859                return -EINVAL;
2860        if (get_user(delay, (int __user *)newval))
2861                return -EFAULT;
2862        rt_cache_flush(delay);
2863        return 0;
2864}
2865
2866ctl_table ipv4_route_table[] = {
2867        {
2868                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2869                .procname       = "flush",
2870                .data           = &flush_delay,
2871                .maxlen         = sizeof(int),
2872                .mode           = 0200,
2873                .proc_handler   = &ipv4_sysctl_rtcache_flush,
2874                .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2875        },
2876        {
2877                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2878                .procname       = "gc_thresh",
2879                .data           = &ipv4_dst_ops.gc_thresh,
2880                .maxlen         = sizeof(int),
2881                .mode           = 0644,
2882                .proc_handler   = &proc_dointvec,
2883        },
2884        {
2885                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2886                .procname       = "max_size",
2887                .data           = &ip_rt_max_size,
2888                .maxlen         = sizeof(int),
2889                .mode           = 0644,
2890                .proc_handler   = &proc_dointvec,
2891        },
2892        {
2893                /*  Deprecated. Use gc_min_interval_ms */
2894
2895                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2896                .procname       = "gc_min_interval",
2897                .data           = &ip_rt_gc_min_interval,
2898                .maxlen         = sizeof(int),
2899                .mode           = 0644,
2900                .proc_handler   = &proc_dointvec_jiffies,
2901                .strategy       = &sysctl_jiffies,
2902        },
2903        {
2904                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2905                .procname       = "gc_min_interval_ms",
2906                .data           = &ip_rt_gc_min_interval,
2907                .maxlen         = sizeof(int),
2908                .mode           = 0644,
2909                .proc_handler   = &proc_dointvec_ms_jiffies,
2910                .strategy       = &sysctl_ms_jiffies,
2911        },
2912        {
2913                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2914                .procname       = "gc_timeout",
2915                .data           = &ip_rt_gc_timeout,
2916                .maxlen         = sizeof(int),
2917                .mode           = 0644,
2918                .proc_handler   = &proc_dointvec_jiffies,
2919                .strategy       = &sysctl_jiffies,
2920        },
2921        {
2922                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2923                .procname       = "gc_interval",
2924                .data           = &ip_rt_gc_interval,
2925                .maxlen         = sizeof(int),
2926                .mode           = 0644,
2927                .proc_handler   = &proc_dointvec_jiffies,
2928                .strategy       = &sysctl_jiffies,
2929        },
2930        {
2931                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2932                .procname       = "redirect_load",
2933                .data           = &ip_rt_redirect_load,
2934                .maxlen         = sizeof(int),
2935                .mode           = 0644,
2936                .proc_handler   = &proc_dointvec,
2937        },
2938        {
2939                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2940                .procname       = "redirect_number",
2941                .data           = &ip_rt_redirect_number,
2942                .maxlen         = sizeof(int),
2943                .mode           = 0644,
2944                .proc_handler   = &proc_dointvec,
2945        },
2946        {
2947                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2948                .procname       = "redirect_silence",
2949                .data           = &ip_rt_redirect_silence,
2950                .maxlen         = sizeof(int),
2951                .mode           = 0644,
2952                .proc_handler   = &proc_dointvec,
2953        },
2954        {
2955                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2956                .procname       = "error_cost",
2957                .data           = &ip_rt_error_cost,
2958                .maxlen         = sizeof(int),
2959                .mode           = 0644,
2960                .proc_handler   = &proc_dointvec,
2961        },
2962        {
2963                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2964                .procname       = "error_burst",
2965                .data           = &ip_rt_error_burst,
2966                .maxlen         = sizeof(int),
2967                .mode           = 0644,
2968                .proc_handler   = &proc_dointvec,
2969        },
2970        {
2971                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2972                .procname       = "gc_elasticity",
2973                .data           = &ip_rt_gc_elasticity,
2974                .maxlen         = sizeof(int),
2975                .mode           = 0644,
2976                .proc_handler   = &proc_dointvec,
2977        },
2978        {
2979                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2980                .procname       = "mtu_expires",
2981                .data           = &ip_rt_mtu_expires,
2982                .maxlen         = sizeof(int),
2983                .mode           = 0644,
2984                .proc_handler   = &proc_dointvec_jiffies,
2985                .strategy       = &sysctl_jiffies,
2986        },
2987        {
2988                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2989                .procname       = "min_pmtu",
2990                .data           = &ip_rt_min_pmtu,
2991                .maxlen         = sizeof(int),
2992                .mode           = 0644,
2993                .proc_handler   = &proc_dointvec,
2994        },
2995        {
2996                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2997                .procname       = "min_adv_mss",
2998                .data           = &ip_rt_min_advmss,
2999                .maxlen         = sizeof(int),
3000                .mode           = 0644,
3001                .proc_handler   = &proc_dointvec,
3002        },
3003        {
3004                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3005                .procname       = "secret_interval",
3006                .data           = &ip_rt_secret_interval,
3007                .maxlen         = sizeof(int),
3008                .mode           = 0644,
3009                .proc_handler   = &proc_dointvec_jiffies,
3010                .strategy       = &sysctl_jiffies,
3011        },
3012        { .ctl_name = 0 }
3013};
3014#endif
3015
#ifdef CONFIG_NET_CLS_ROUTE
/* Per-CPU accounting area; allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3019
3020static __initdata unsigned long rhash_entries;
3021static int __init set_rhash_entries(char *str)
3022{
3023        if (!str)
3024                return 0;
3025        rhash_entries = simple_strtoul(str, &str, 0);
3026        return 1;
3027}
3028__setup("rhash_entries=", set_rhash_entries);
3029
3030int __init ip_rt_init(void)
3031{
3032        int rc = 0;
3033
3034        atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3035                             (jiffies ^ (jiffies >> 7))));
3036
3037#ifdef CONFIG_NET_CLS_ROUTE
3038        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3039        if (!ip_rt_acct)
3040                panic("IP: failed to allocate ip_rt_acct\n");
3041#endif
3042
3043        ipv4_dst_ops.kmem_cachep =
3044                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3045                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3046
3047        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3048
3049        rt_hash_table = (struct rt_hash_bucket *)
3050                alloc_large_system_hash("IP route cache",
3051                                        sizeof(struct rt_hash_bucket),
3052                                        rhash_entries,
3053                                        (num_physpages >= 128 * 1024) ?
3054                                        15 : 17,
3055                                        0,
3056                                        &rt_hash_log,
3057                                        &rt_hash_mask,
3058                                        0);
3059        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3060        rt_hash_lock_init();
3061
3062        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3063        ip_rt_max_size = (rt_hash_mask + 1) * 16;
3064
3065        devinet_init();
3066        ip_fib_init();
3067
3068        rt_secret_timer.function = rt_secret_rebuild;
3069        rt_secret_timer.data = 0;
3070        init_timer_deferrable(&rt_secret_timer);
3071
3072        /* All the timers, started at system startup tend
3073           to synchronize. Perturb it a bit.
3074         */
3075        schedule_delayed_work(&expires_work,
3076                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3077
3078        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3079                ip_rt_secret_interval;
3080        add_timer(&rt_secret_timer);
3081
3082        if (ip_rt_proc_init())
3083                printk(KERN_ERR "Unable to create route proc files\n");
3084#ifdef CONFIG_XFRM
3085        xfrm_init();
3086        xfrm4_init();
3087#endif
3088        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3089
3090        return rc;
3091}
3092
/* Symbols exported from this file for use by modules. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);
3096
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.