linux/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       Verify area fixes.
  16 *              Alan Cox        :       cli() protects routing changes
  17 *              Rui Oliveira    :       ICMP routing table updates
  18 *              (rco@di.uminho.pt)      Routing table insertion and update
  19 *              Linus Torvalds  :       Rewrote bits to be sensible
  20 *              Alan Cox        :       Added BSD route gw semantics
  21 *              Alan Cox        :       Super /proc >4K
  22 *              Alan Cox        :       MTU in route table
  23 *              Alan Cox        :       MSS actually. Also added the window
  24 *                                      clamper.
  25 *              Sam Lantinga    :       Fixed route matching in rt_del()
  26 *              Alan Cox        :       Routing cache support.
  27 *              Alan Cox        :       Removed compatibility cruft.
  28 *              Alan Cox        :       RTF_REJECT support.
  29 *              Alan Cox        :       TCP irtt support.
  30 *              Jonathan Naylor :       Added Metric support.
  31 *      Miquel van Smoorenburg  :       BSD API fixes.
  32 *      Miquel van Smoorenburg  :       Metrics.
  33 *              Alan Cox        :       Use __u32 properly
  34 *              Alan Cox        :       Aligned routing errors more closely with BSD
  35 *                                      our system is still very different.
  36 *              Alan Cox        :       Faster /proc handling
  37 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38 *                                      routing caches and better behaviour.
  39 *
  40 *              Olaf Erb        :       irtt wasn't being copied right.
  41 *              Bjorn Ekwall    :       Kerneld route support.
  42 *              Alan Cox        :       Multicast fixed (I hope)
  43 *              Pavel Krauz     :       Limited broadcast fixed
  44 *              Mike McLagan    :       Routing by source
  45 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46 *                                      route.c and rewritten from scratch.
  47 *              Andi Kleen      :       Load-limit warning messages.
  48 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52 *              Marc Boucher    :       routing by fwmark
  53 *      Robert Olsson           :       Added rt_cache statistics
  54 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58 *
  59 *              This program is free software; you can redistribute it and/or
  60 *              modify it under the terms of the GNU General Public License
  61 *              as published by the Free Software Foundation; either version
  62 *              2 of the License, or (at your option) any later version.
  63 */
  64
  65#include <linux/module.h>
  66#include <asm/uaccess.h>
  67#include <asm/system.h>
  68#include <linux/bitops.h>
  69#include <linux/types.h>
  70#include <linux/kernel.h>
  71#include <linux/mm.h>
  72#include <linux/bootmem.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/workqueue.h>
  83#include <linux/skbuff.h>
  84#include <linux/inetdevice.h>
  85#include <linux/igmp.h>
  86#include <linux/pkt_sched.h>
  87#include <linux/mroute.h>
  88#include <linux/netfilter_ipv4.h>
  89#include <linux/random.h>
  90#include <linux/jhash.h>
  91#include <linux/rcupdate.h>
  92#include <linux/times.h>
  93#include <net/dst.h>
  94#include <net/net_namespace.h>
  95#include <net/protocol.h>
  96#include <net/ip.h>
  97#include <net/route.h>
  98#include <net/inetpeer.h>
  99#include <net/sock.h>
 100#include <net/ip_fib.h>
 101#include <net/arp.h>
 102#include <net/tcp.h>
 103#include <net/icmp.h>
 104#include <net/xfrm.h>
 105#include <net/netevent.h>
 106#include <net/rtnetlink.h>
 107#ifdef CONFIG_SYSCTL
 108#include <linux/sysctl.h>
 109#endif
 110
 111#define RT_FL_TOS(oldflp) \
 112    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
#define IP_MAX_MTU      0xFFF0

/* Default value of ip_rt_gc_timeout: 300 * HZ timer ticks. */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * Route cache tunables.  Only some of their users sit next to these
 * definitions; the per-variable notes below describe just the uses made by
 * the periodic expiry code (rt_check_expire() / rt_worker_func()).
 */
static int ip_rt_max_size;
/* Age limit, in jiffies, used by rt_check_expire() when ageing entries. */
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
/* Delay passed to schedule_delayed_work() when rt_worker_func() re-arms. */
static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
/* Lower bound used when rt_check_expire() recomputes rt_chain_length_max. */
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;
static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
/* Recomputed by rt_check_expire() from the chain lengths it observes. */
static int rt_chain_length_max __read_mostly    = 20;

/* Delayed work item that rt_worker_func() reschedules after every run. */
static struct delayed_work expires_work;
/* jiffies value recorded by the most recent rt_check_expire() run. */
static unsigned long expires_ljiffies;
 136
/*
 *      Interface to generic destination cache.
 */

/*
 * Forward declarations: the callbacks wired into ipv4_dst_ops below, plus
 * two helpers that are defined elsewhere in this file.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static void rt_emergency_hash_rebuild(struct net *net);


/*
 * IPv4 callback table handed to the generic destination cache.
 * .entries starts at zero; rt_cpu_seq_show() reports it in the "entries"
 * column of the statistics file.
 */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .local_out =            __ip_local_out,
        .entries =              ATOMIC_INIT(0),
};
 165
 166#define ECN_OR_COST(class)      TC_PRIO_##class
 167
 168const __u8 ip_tos2prio[16] = {
 169        TC_PRIO_BESTEFFORT,
 170        ECN_OR_COST(FILLER),
 171        TC_PRIO_BESTEFFORT,
 172        ECN_OR_COST(BESTEFFORT),
 173        TC_PRIO_BULK,
 174        ECN_OR_COST(BULK),
 175        TC_PRIO_BULK,
 176        ECN_OR_COST(BULK),
 177        TC_PRIO_INTERACTIVE,
 178        ECN_OR_COST(INTERACTIVE),
 179        TC_PRIO_INTERACTIVE,
 180        ECN_OR_COST(INTERACTIVE),
 181        TC_PRIO_INTERACTIVE_BULK,
 182        ECN_OR_COST(INTERACTIVE_BULK),
 183        TC_PRIO_INTERACTIVE_BULK,
 184        ECN_OR_COST(INTERACTIVE_BULK)
 185};
 186
 187
 188/*
 189 * Route cache.
 190 */
 191
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

/*
 * One slot of the route hash table: the head of a singly linked chain of
 * rtable entries that are linked through u.dst.rt_next.
 */
struct rt_hash_bucket {
        struct rtable   *chain;
};
 205
 206#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 207        defined(CONFIG_PROVE_LOCKING)
 208/*
 209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 210 * The size of this table is a power of two and depends on the number of CPUS.
 211 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 212 */
 213#ifdef CONFIG_LOCKDEP
 214# define RT_HASH_LOCK_SZ        256
 215#else
 216# if NR_CPUS >= 32
 217#  define RT_HASH_LOCK_SZ       4096
 218# elif NR_CPUS >= 16
 219#  define RT_HASH_LOCK_SZ       2048
 220# elif NR_CPUS >= 8
 221#  define RT_HASH_LOCK_SZ       1024
 222# elif NR_CPUS >= 4
 223#  define RT_HASH_LOCK_SZ       512
 224# else
 225#  define RT_HASH_LOCK_SZ       256
 226# endif
 227#endif
 228
 229static spinlock_t       *rt_hash_locks;
 230# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 231
 232static __init void rt_hash_lock_init(void)
 233{
 234        int i;
 235
 236        rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 237                        GFP_KERNEL);
 238        if (!rt_hash_locks)
 239                panic("IP: failed to allocate rt_hash_locks\n");
 240
 241        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 242                spin_lock_init(&rt_hash_locks[i]);
 243}
 244#else
 245# define rt_hash_lock_addr(slot) NULL
 246
 247static inline void rt_hash_lock_init(void)
 248{
 249}
 250#endif
 251
/* Array of hash chains; the flush and expiry code walks slots 0..rt_hash_mask. */
static struct rt_hash_bucket    *rt_hash_table __read_mostly;
/* Mask applied to hash values by rt_hash(); also the highest valid slot index. */
static unsigned                 rt_hash_mask __read_mostly;
/*
 * Shift count used by rt_check_expire() when sizing a scan.
 * NOTE(review): presumably log2 of the table size; it is set outside the
 * code shown here - confirm.
 */
static unsigned int             rt_hash_log  __read_mostly;

/* Per-CPU route cache counters, printed by rt_cpu_seq_show(). */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment counter @field in the current CPU's rt_cache_stat. */
#define RT_CACHE_STAT_INC(field) \
        (__raw_get_cpu_var(rt_cache_stat).field++)
 259
 260static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 261                int genid)
 262{
 263        return jhash_3words((__force u32)(__be32)(daddr),
 264                            (__force u32)(__be32)(saddr),
 265                            idx, genid)
 266                & rt_hash_mask;
 267}
 268
 269static inline int rt_genid(struct net *net)
 270{
 271        return atomic_read(&net->ipv4.rt_genid);
 272}
 273
 274#ifdef CONFIG_PROC_FS
/*
 * Private iterator state of the route cache seq_file; rt_cache_seq_open()
 * passes the size of this struct to seq_open_net().
 */
struct rt_cache_iter_state {
        struct seq_net_private p;
        /* Hash slot being walked; the walk runs from rt_hash_mask down to 0. */
        int bucket;
        /* Generation id sampled by rt_cache_seq_start() at position 0;
         * entries carrying a different rt_genid are skipped. */
        int genid;
};
 280
 281static struct rtable *rt_cache_get_first(struct seq_file *seq)
 282{
 283        struct rt_cache_iter_state *st = seq->private;
 284        struct rtable *r = NULL;
 285
 286        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 287                if (!rt_hash_table[st->bucket].chain)
 288                        continue;
 289                rcu_read_lock_bh();
 290                r = rcu_dereference(rt_hash_table[st->bucket].chain);
 291                while (r) {
 292                        if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 293                            r->rt_genid == st->genid)
 294                                return r;
 295                        r = rcu_dereference(r->u.dst.rt_next);
 296                }
 297                rcu_read_unlock_bh();
 298        }
 299        return r;
 300}
 301
 302static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 303                                          struct rtable *r)
 304{
 305        struct rt_cache_iter_state *st = seq->private;
 306
 307        r = r->u.dst.rt_next;
 308        while (!r) {
 309                rcu_read_unlock_bh();
 310                do {
 311                        if (--st->bucket < 0)
 312                                return NULL;
 313                } while (!rt_hash_table[st->bucket].chain);
 314                rcu_read_lock_bh();
 315                r = rt_hash_table[st->bucket].chain;
 316        }
 317        return rcu_dereference(r);
 318}
 319
 320static struct rtable *rt_cache_get_next(struct seq_file *seq,
 321                                        struct rtable *r)
 322{
 323        struct rt_cache_iter_state *st = seq->private;
 324        while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 325                if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 326                        continue;
 327                if (r->rt_genid == st->genid)
 328                        break;
 329        }
 330        return r;
 331}
 332
 333static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 334{
 335        struct rtable *r = rt_cache_get_first(seq);
 336
 337        if (r)
 338                while (pos && (r = rt_cache_get_next(seq, r)))
 339                        --pos;
 340        return pos ? NULL : r;
 341}
 342
 343static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 344{
 345        struct rt_cache_iter_state *st = seq->private;
 346        if (*pos)
 347                return rt_cache_get_idx(seq, *pos - 1);
 348        st->genid = rt_genid(seq_file_net(seq));
 349        return SEQ_START_TOKEN;
 350}
 351
 352static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 353{
 354        struct rtable *r;
 355
 356        if (v == SEQ_START_TOKEN)
 357                r = rt_cache_get_first(seq);
 358        else
 359                r = rt_cache_get_next(seq, v);
 360        ++*pos;
 361        return r;
 362}
 363
 364static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 365{
 366        if (v && v != SEQ_START_TOKEN)
 367                rcu_read_unlock_bh();
 368}
 369
/*
 * seq_file ->show() for the route cache listing.
 * Every record, header included, is padded to 127 characters followed by a
 * newline.  For SEQ_START_TOKEN the column header is printed; otherwise @v
 * is a struct rtable and one tab-separated line is printed for it.
 * Always returns 0.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                int len;

                /*
                 * Notes on individual columns:
                 *  - "Metric" is always printed as the literal 0;
                 *  - "MTU" is the ADVMSS metric plus 40, or 0 when unset;
                 *  - "IRTT" is (RTT metric >> 3) + RTTVAR metric;
                 *  - "HHRef"/"HHUptod" are -1/0 when no hh is attached.
                 * The trailing %n stores the number of characters produced
                 * so far in len, which drives the padding below.
                 */
                seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst, &len);

                /* Pad the line with spaces up to the fixed record width. */
                seq_printf(seq, "%*s\n", 127 - len, "");
        }
        return 0;
}
 402
/* Iterator callbacks of the route cache listing; installed by rt_cache_seq_open(). */
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};
 409
 410static int rt_cache_seq_open(struct inode *inode, struct file *file)
 411{
 412        return seq_open_net(inode, file, &rt_cache_seq_ops,
 413                        sizeof(struct rt_cache_iter_state));
 414}
 415
/*
 * File operations of the per-namespace "rt_cache" listing registered by
 * ip_rt_do_proc_init(); release pairs with the seq_open_net() done at open.
 */
static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
 423
 424
 425static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 426{
 427        int cpu;
 428
 429        if (*pos == 0)
 430                return SEQ_START_TOKEN;
 431
 432        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 433                if (!cpu_possible(cpu))
 434                        continue;
 435                *pos = cpu+1;
 436                return &per_cpu(rt_cache_stat, cpu);
 437        }
 438        return NULL;
 439}
 440
 441static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 442{
 443        int cpu;
 444
 445        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 446                if (!cpu_possible(cpu))
 447                        continue;
 448                *pos = cpu+1;
 449                return &per_cpu(rt_cache_stat, cpu);
 450        }
 451        return NULL;
 452
 453}
 454
 455static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 456{
 457
 458}
 459
 460static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 461{
 462        struct rt_cache_stat *st = v;
 463
 464        if (v == SEQ_START_TOKEN) {
 465                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 466                return 0;
 467        }
 468
 469        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 470                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 471                   atomic_read(&ipv4_dst_ops.entries),
 472                   st->in_hit,
 473                   st->in_slow_tot,
 474                   st->in_slow_mc,
 475                   st->in_no_route,
 476                   st->in_brd,
 477                   st->in_martian_dst,
 478                   st->in_martian_src,
 479
 480                   st->out_hit,
 481                   st->out_slow_tot,
 482                   st->out_slow_mc,
 483
 484                   st->gc_total,
 485                   st->gc_ignored,
 486                   st->gc_goal_miss,
 487                   st->gc_dst_overflow,
 488                   st->in_hlist_search,
 489                   st->out_hlist_search
 490                );
 491        return 0;
 492}
 493
/* Iterator callbacks of the per-CPU statistics file; installed by rt_cpu_seq_open(). */
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};
 500
 501
 502static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 503{
 504        return seq_open(file, &rt_cpu_seq_ops);
 505}
 506
/*
 * File operations of the statistics file that ip_rt_do_proc_init() creates
 * as "rt_cache" under net->proc_net_stat.
 */
static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
 514
 515#ifdef CONFIG_NET_CLS_ROUTE
 516static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 517                           int length, int *eof, void *data)
 518{
 519        unsigned int i;
 520
 521        if ((offset & 3) || (length & 3))
 522                return -EIO;
 523
 524        if (offset >= sizeof(struct ip_rt_acct) * 256) {
 525                *eof = 1;
 526                return 0;
 527        }
 528
 529        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 530                length = sizeof(struct ip_rt_acct) * 256 - offset;
 531                *eof = 1;
 532        }
 533
 534        offset /= sizeof(u32);
 535
 536        if (length > 0) {
 537                u32 *dst = (u32 *) buffer;
 538
 539                *start = buffer;
 540                memset(dst, 0, length);
 541
 542                for_each_possible_cpu(i) {
 543                        unsigned int j;
 544                        u32 *src;
 545
 546                        src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 547                        for (j = 0; j < length/4; j++)
 548                                dst[j] += src[j];
 549                }
 550        }
 551        return length;
 552}
 553#endif
 554
 555static int __net_init ip_rt_do_proc_init(struct net *net)
 556{
 557        struct proc_dir_entry *pde;
 558
 559        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 560                        &rt_cache_seq_fops);
 561        if (!pde)
 562                goto err1;
 563
 564        pde = proc_create("rt_cache", S_IRUGO,
 565                          net->proc_net_stat, &rt_cpu_seq_fops);
 566        if (!pde)
 567                goto err2;
 568
 569#ifdef CONFIG_NET_CLS_ROUTE
 570        pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 571                        ip_rt_acct_read, NULL);
 572        if (!pde)
 573                goto err3;
 574#endif
 575        return 0;
 576
 577#ifdef CONFIG_NET_CLS_ROUTE
 578err3:
 579        remove_proc_entry("rt_cache", net->proc_net_stat);
 580#endif
 581err2:
 582        remove_proc_entry("rt_cache", net->proc_net);
 583err1:
 584        return -ENOMEM;
 585}
 586
 587static void __net_exit ip_rt_do_proc_exit(struct net *net)
 588{
 589        remove_proc_entry("rt_cache", net->proc_net_stat);
 590        remove_proc_entry("rt_cache", net->proc_net);
 591        remove_proc_entry("rt_acct", net->proc_net);
 592}
 593
/*
 * Per-namespace setup/teardown hooks for the procfs entries; registered by
 * ip_rt_proc_init().
 */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};
 598
 599static int __init ip_rt_proc_init(void)
 600{
 601        return register_pernet_subsys(&ip_rt_proc_ops);
 602}
 603
 604#else
 605static inline int ip_rt_proc_init(void)
 606{
 607        return 0;
 608}
 609#endif /* CONFIG_PROC_FS */
 610
 611static inline void rt_free(struct rtable *rt)
 612{
 613        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 614}
 615
 616static inline void rt_drop(struct rtable *rt)
 617{
 618        ip_rt_put(rt);
 619        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 620}
 621
 622static inline int rt_fast_clean(struct rtable *rth)
 623{
 624        /* Kill broadcast/multicast entries very aggresively, if they
 625           collide in hash table with more useful entries */
 626        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 627                rth->fl.iif && rth->u.dst.rt_next;
 628}
 629
 630static inline int rt_valuable(struct rtable *rth)
 631{
 632        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 633                rth->u.dst.expires;
 634}
 635
 636static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 637{
 638        unsigned long age;
 639        int ret = 0;
 640
 641        if (atomic_read(&rth->u.dst.__refcnt))
 642                goto out;
 643
 644        ret = 1;
 645        if (rth->u.dst.expires &&
 646            time_after_eq(jiffies, rth->u.dst.expires))
 647                goto out;
 648
 649        age = jiffies - rth->u.dst.lastuse;
 650        ret = 0;
 651        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 652            (age <= tmo2 && rt_valuable(rth)))
 653                goto out;
 654        ret = 1;
 655out:    return ret;
 656}
 657
 658/* Bits of score are:
 659 * 31: very valuable
 660 * 30: not quite useless
 661 * 29..0: usage counter
 662 */
 663static inline u32 rt_score(struct rtable *rt)
 664{
 665        u32 score = jiffies - rt->u.dst.lastuse;
 666
 667        score = ~score & ~(3<<30);
 668
 669        if (rt_valuable(rt))
 670                score |= (1<<31);
 671
 672        if (!rt->fl.iif ||
 673            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 674                score |= (1<<30);
 675
 676        return score;
 677}
 678
 679static inline bool rt_caching(const struct net *net)
 680{
 681        return net->ipv4.current_rt_cache_rebuild_count <=
 682                net->ipv4.sysctl_rt_cache_rebuild_count;
 683}
 684
 685static inline bool compare_hash_inputs(const struct flowi *fl1,
 686                                        const struct flowi *fl2)
 687{
 688        return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 689                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
 690                (fl1->iif ^ fl2->iif)) == 0);
 691}
 692
 693static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 694{
 695        return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 696                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 697                (fl1->mark ^ fl2->mark) |
 698                (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 699                 *(u16 *)&fl2->nl_u.ip4_u.tos) |
 700                (fl1->oif ^ fl2->oif) |
 701                (fl1->iif ^ fl2->iif)) == 0;
 702}
 703
 704static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 705{
 706        return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 707}
 708
 709static inline int rt_is_expired(struct rtable *rth)
 710{
 711        return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 712}
 713
/*
 * Perform a full scan of the hash table and free entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 *
 * With CONFIG_NET_NS only entries for which rt_is_expired() is true are
 * unlinked and freed; without it every chain is detached and freed whole.
 *
 * @process_context: non-zero when the caller may sleep, which enables the
 *                   cond_resched() between hash slots.
 */
static void rt_do_flush(int process_context)
{
        unsigned int i;
        struct rtable *rth, *next;
        struct rtable * tail;

        for (i = 0; i <= rt_hash_mask; i++) {
                if (process_context && need_resched())
                        cond_resched();
                /* Unlocked peek: do not take the lock for an empty slot. */
                rth = rt_hash_table[i].chain;
                if (!rth)
                        continue;

                spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
                {
                struct rtable ** prev, * p;

                /* Re-read the head now that the slot lock is held. */
                rth = rt_hash_table[i].chain;

                /* defer releasing the head of the list after spin_unlock */
                for (tail = rth; tail; tail = tail->u.dst.rt_next)
                        if (!rt_is_expired(tail))
                                break;
                /*
                 * [rth, tail) is the leading run of expired entries; it is
                 * cut off here and freed after the lock is dropped.
                 */
                if (rth != tail)
                        rt_hash_table[i].chain = tail;

                /* call rt_free on entries after the tail requiring flush */
                prev = &rt_hash_table[i].chain;
                for (p = *prev; p; p = next) {
                        next = p->u.dst.rt_next;
                        if (!rt_is_expired(p)) {
                                prev = &p->u.dst.rt_next;
                        } else {
                                *prev = next;
                                rt_free(p);
                        }
                }
                }
#else
                /* Detach the whole chain; tail == NULL frees all of it below. */
                rth = rt_hash_table[i].chain;
                rt_hash_table[i].chain = NULL;
                tail = NULL;
#endif
                spin_unlock_bh(rt_hash_lock_addr(i));

                /* Free the detached entries outside the slot lock. */
                for (; rth != tail; rth = next) {
                        next = rth->u.dst.rt_next;
                        rt_free(rth);
                }
        }
}
 771
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is done to have an estimation of rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/*
 * Scan part of the hash table, starting after the slot where the previous
 * run stopped, and remove entries that are out of date or aged off.
 *
 * The number of slots visited is
 *     (jiffies since the previous run) << rt_hash_log / ip_rt_gc_timeout
 * capped at the size of the table.  The chain lengths seen during the scan
 * are then used to recompute rt_chain_length_max.
 */
static void rt_check_expire(void)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, *aux, **rthp;
        unsigned long samples = 0;
        unsigned long sum = 0, sum2 = 0;
        unsigned long delta;
        u64 mult;

        delta = jiffies - expires_ljiffies;
        expires_ljiffies = jiffies;
        mult = ((u64)delta) << rt_hash_log;
        /* The division is skipped for timeouts of 1 or less. */
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask)
                goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                /* Halved for every entry kept in this chain (see nofree). */
                unsigned long tmo = ip_rt_gc_timeout;
                unsigned long length;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (need_resched())
                        cond_resched();

                /* Empty slots count as samples too. */
                samples++;

                if (*rthp == NULL)
                        continue;
                length = 0;
                spin_lock_bh(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
                        prefetch(rth->u.dst.rt_next);
                        /* Entries of an old generation are dropped at once. */
                        if (rt_is_expired(rth)) {
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
                                continue;
                        }
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
                                        /*
                                         * Keep the entry: shrink the age
                                         * allowance for the rest of the
                                         * chain and step past it.
                                         */
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        /*
                                         * We only count entries on
                                         * a chain with equal hash inputs once
                                         * so that entries for different QOS
                                         * levels, and other non-hash input
                                         * attributes don't unfairly skew
                                         * the length computation
                                         */
                                        for (aux = rt_hash_table[i].chain;;) {
                                                if (aux == rth) {
                                                        length += ONE;
                                                        break;
                                                }
                                                if (compare_hash_inputs(&aux->fl, &rth->fl))
                                                        break;
                                                aux = aux->u.dst.rt_next;
                                        }
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
                                goto nofree;

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.dst.rt_next;
                        rt_free(rth);
                }
                spin_unlock_bh(rt_hash_lock_addr(i));
                sum += length;
                sum2 += length*length;
        }
        if (samples) {
                /* Both values are fixed point with FRACT_BITS fraction bits. */
                unsigned long avg = sum / samples;
                unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
                rt_chain_length_max = max_t(unsigned long,
                                        ip_rt_gc_elasticity,
                                        (avg + 4*sd) >> FRACT_BITS);
        }
        /* Remember where to resume on the next run. */
        rover = i;
}
 869
 870/*
 871 * rt_worker_func() is run in process context.
 872 * we call rt_check_expire() to scan part of the hash table
 873 */
 874static void rt_worker_func(struct work_struct *work)
 875{
 876        rt_check_expire();
 877        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 878}
 879
 880/*
 881 * Perturbation of rt_genid by a small quantity [1..256].
 882 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 883 * many times (2^24) without producing a recently used rt_genid.
 884 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 885 */
 886static void rt_cache_invalidate(struct net *net)
 887{
 888        unsigned char shuffle;
 889
 890        get_random_bytes(&shuffle, sizeof(shuffle));
 891        atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 892}
 893
 894/*
 895 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 896 * delay >= 0 : invalidate & flush cache (can be long)
 897 */
 void rt_cache_flush(struct net *net, int delay)
{
        /* The generation id is always bumped, whatever @delay is. */
        rt_cache_invalidate(net);

        /* Negative delay: invalidate only, no flush here. */
        if (delay < 0)
                return;

        /* Flush now; the argument is true unless we are in softirq context. */
        rt_do_flush(!in_softirq());
}
 904
 905/*
 906 * We change rt_genid and let gc do the cleanup
 907 */
 908static void rt_secret_rebuild(unsigned long __net)
 909{
 910        struct net *net = (struct net *)__net;
 911        rt_cache_invalidate(net);
 912        mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 913}
 914
 915static void rt_secret_rebuild_oneshot(struct net *net)
 916{
 917        del_timer_sync(&net->ipv4.rt_secret_timer);
 918        rt_cache_invalidate(net);
 919        if (ip_rt_secret_interval) {
 920                net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
 921                add_timer(&net->ipv4.rt_secret_timer);
 922        }
 923}
 924
 925static void rt_emergency_hash_rebuild(struct net *net)
 926{
 927        if (net_ratelimit()) {
 928                printk(KERN_WARNING "Route hash chain too long!\n");
 929                printk(KERN_WARNING "Adjust your secret_interval!\n");
 930        }
 931
 932        rt_secret_rebuild_oneshot(net);
 933}
 934
 935/*
 936   Short description of GC goals.
 937
 938   We want an algorithm that keeps the routing cache
 939   at an equilibrium point, where the number of aged-off entries
 940   is kept approximately equal to the number of newly generated ones.
 941
 942   The current expiration strength is the variable "expire".
 943   We try to adjust it dynamically, so that when networking
 944   is idle "expire" is large enough to keep enough warm entries,
 945   and when load increases it shrinks to limit the cache size.
 946 */
 947
 948static int rt_garbage_collect(struct dst_ops *ops)
 949{
 950        static unsigned long expire = RT_GC_TIMEOUT;
 951        static unsigned long last_gc;
 952        static int rover;
 953        static int equilibrium;
 954        struct rtable *rth, **rthp;
 955        unsigned long now = jiffies;
 956        int goal;
 957
 958        /*
 959         * Garbage collection is pretty expensive,
 960         * do not make it too frequently.
 961         */
 962
 963        RT_CACHE_STAT_INC(gc_total);
 964
 965        if (now - last_gc < ip_rt_gc_min_interval &&
 966            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 967                RT_CACHE_STAT_INC(gc_ignored);
 968                goto out;
 969        }
 970
 971        /* Calculate number of entries, which we want to expire now. */
 972        goal = atomic_read(&ipv4_dst_ops.entries) -
 973                (ip_rt_gc_elasticity << rt_hash_log);
 974        if (goal <= 0) {
 975                if (equilibrium < ipv4_dst_ops.gc_thresh)
 976                        equilibrium = ipv4_dst_ops.gc_thresh;
 977                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 978                if (goal > 0) {
 979                        equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 980                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 981                }
 982        } else {
 983                /* We are in dangerous area. Try to reduce cache really
 984                 * aggressively.
 985                 */
 986                goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 987                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 988        }
 989
 990        if (now - last_gc >= ip_rt_gc_min_interval)
 991                last_gc = now;
 992
 993        if (goal <= 0) {
 994                equilibrium += goal;
 995                goto work_done;
 996        }
 997
 998        do {
 999                int i, k;
1000
1001                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1002                        unsigned long tmo = expire;
1003
1004                        k = (k + 1) & rt_hash_mask;
1005                        rthp = &rt_hash_table[k].chain;
1006                        spin_lock_bh(rt_hash_lock_addr(k));
1007                        while ((rth = *rthp) != NULL) {
1008                                if (!rt_is_expired(rth) &&
1009                                        !rt_may_expire(rth, tmo, expire)) {
1010                                        tmo >>= 1;
1011                                        rthp = &rth->u.dst.rt_next;
1012                                        continue;
1013                                }
1014                                *rthp = rth->u.dst.rt_next;
1015                                rt_free(rth);
1016                                goal--;
1017                        }
1018                        spin_unlock_bh(rt_hash_lock_addr(k));
1019                        if (goal <= 0)
1020                                break;
1021                }
1022                rover = k;
1023
1024                if (goal <= 0)
1025                        goto work_done;
1026
1027                /* Goal is not achieved. We stop process if:
1028
1029                   - if expire reduced to zero. Otherwise, expire is halfed.
1030                   - if table is not full.
1031                   - if we are called from interrupt.
1032                   - jiffies check is just fallback/debug loop breaker.
1033                     We will not spin here for long time in any case.
1034                 */
1035
1036                RT_CACHE_STAT_INC(gc_goal_miss);
1037
1038                if (expire == 0)
1039                        break;
1040
1041                expire >>= 1;
1042#if RT_CACHE_DEBUG >= 2
1043                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1044                                atomic_read(&ipv4_dst_ops.entries), goal, i);
1045#endif
1046
1047                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1048                        goto out;
1049        } while (!in_softirq() && time_before_eq(jiffies, now));
1050
1051        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052                goto out;
1053        if (net_ratelimit())
1054                printk(KERN_WARNING "dst cache overflow\n");
1055        RT_CACHE_STAT_INC(gc_dst_overflow);
1056        return 1;
1057
1058work_done:
1059        expire += ip_rt_gc_min_interval;
1060        if (expire > ip_rt_gc_timeout ||
1061            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1062                expire = ip_rt_gc_timeout;
1063#if RT_CACHE_DEBUG >= 2
1064        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1065                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
1066#endif
1067out:    return 0;
1068}
1069
1070static int rt_intern_hash(unsigned hash, struct rtable *rt,
1071                          struct rtable **rp, struct sk_buff *skb)
1072{
1073        struct rtable   *rth, **rthp;
1074        unsigned long   now;
1075        struct rtable *cand, **candp;
1076        u32             min_score;
1077        int             chain_length;
1078        int attempts = !in_softirq();
1079
1080restart:
1081        chain_length = 0;
1082        min_score = ~(u32)0;
1083        cand = NULL;
1084        candp = NULL;
1085        now = jiffies;
1086
1087        if (!rt_caching(dev_net(rt->u.dst.dev))) {
1088                /*
1089                 * If we're not caching, just tell the caller we
1090                 * were successful and don't touch the route.  The
1091                 * caller hold the sole reference to the cache entry, and
1092                 * it will be released when the caller is done with it.
1093                 * If we drop it here, the callers have no way to resolve routes
1094                 * when we're not caching.  Instead, just point *rp at rt, so
1095                 * the caller gets a single use out of the route
1096                 * Note that we do rt_free on this new route entry, so that
1097                 * once its refcount hits zero, we are still able to reap it
1098                 * (Thanks Alexey)
1099                 * Note also the rt_free uses call_rcu.  We don't actually
1100                 * need rcu protection here, this is just our path to get
1101                 * on the route gc list.
1102                 */
1103
1104                if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1105                        int err = arp_bind_neighbour(&rt->u.dst);
1106                        if (err) {
1107                                if (net_ratelimit())
1108                                        printk(KERN_WARNING
1109                                            "Neighbour table failure & not caching routes.\n");
1110                                rt_drop(rt);
1111                                return err;
1112                        }
1113                }
1114
1115                rt_free(rt);
1116                goto skip_hashing;
1117        }
1118
1119        rthp = &rt_hash_table[hash].chain;
1120
1121        spin_lock_bh(rt_hash_lock_addr(hash));
1122        while ((rth = *rthp) != NULL) {
1123                if (rt_is_expired(rth)) {
1124                        *rthp = rth->u.dst.rt_next;
1125                        rt_free(rth);
1126                        continue;
1127                }
1128                if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1129                        /* Put it first */
1130                        *rthp = rth->u.dst.rt_next;
1131                        /*
1132                         * Since lookup is lockfree, the deletion
1133                         * must be visible to another weakly ordered CPU before
1134                         * the insertion at the start of the hash chain.
1135                         */
1136                        rcu_assign_pointer(rth->u.dst.rt_next,
1137                                           rt_hash_table[hash].chain);
1138                        /*
1139                         * Since lookup is lockfree, the update writes
1140                         * must be ordered for consistency on SMP.
1141                         */
1142                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1143
1144                        dst_use(&rth->u.dst, now);
1145                        spin_unlock_bh(rt_hash_lock_addr(hash));
1146
1147                        rt_drop(rt);
1148                        if (rp)
1149                                *rp = rth;
1150                        else
1151                                skb_dst_set(skb, &rth->u.dst);
1152                        return 0;
1153                }
1154
1155                if (!atomic_read(&rth->u.dst.__refcnt)) {
1156                        u32 score = rt_score(rth);
1157
1158                        if (score <= min_score) {
1159                                cand = rth;
1160                                candp = rthp;
1161                                min_score = score;
1162                        }
1163                }
1164
1165                chain_length++;
1166
1167                rthp = &rth->u.dst.rt_next;
1168        }
1169
1170        if (cand) {
1171                /* ip_rt_gc_elasticity used to be average length of chain
1172                 * length, when exceeded gc becomes really aggressive.
1173                 *
1174                 * The second limit is less certain. At the moment it allows
1175                 * only 2 entries per bucket. We will see.
1176                 */
1177                if (chain_length > ip_rt_gc_elasticity) {
1178                        *candp = cand->u.dst.rt_next;
1179                        rt_free(cand);
1180                }
1181        } else {
1182                if (chain_length > rt_chain_length_max) {
1183                        struct net *net = dev_net(rt->u.dst.dev);
1184                        int num = ++net->ipv4.current_rt_cache_rebuild_count;
1185                        if (!rt_caching(dev_net(rt->u.dst.dev))) {
1186                                printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1187                                        rt->u.dst.dev->name, num);
1188                        }
1189                        rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1190                }
1191        }
1192
1193        /* Try to bind route to arp only if it is output
1194           route or unicast forwarding path.
1195         */
1196        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1197                int err = arp_bind_neighbour(&rt->u.dst);
1198                if (err) {
1199                        spin_unlock_bh(rt_hash_lock_addr(hash));
1200
1201                        if (err != -ENOBUFS) {
1202                                rt_drop(rt);
1203                                return err;
1204                        }
1205
1206                        /* Neighbour tables are full and nothing
1207                           can be released. Try to shrink route cache,
1208                           it is most likely it holds some neighbour records.
1209                         */
1210                        if (attempts-- > 0) {
1211                                int saved_elasticity = ip_rt_gc_elasticity;
1212                                int saved_int = ip_rt_gc_min_interval;
1213                                ip_rt_gc_elasticity     = 1;
1214                                ip_rt_gc_min_interval   = 0;
1215                                rt_garbage_collect(&ipv4_dst_ops);
1216                                ip_rt_gc_min_interval   = saved_int;
1217                                ip_rt_gc_elasticity     = saved_elasticity;
1218                                goto restart;
1219                        }
1220
1221                        if (net_ratelimit())
1222                                printk(KERN_WARNING "Neighbour table overflow.\n");
1223                        rt_drop(rt);
1224                        return -ENOBUFS;
1225                }
1226        }
1227
1228        rt->u.dst.rt_next = rt_hash_table[hash].chain;
1229
1230#if RT_CACHE_DEBUG >= 2
1231        if (rt->u.dst.rt_next) {
1232                struct rtable *trt;
1233                printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1234                       hash, &rt->rt_dst);
1235                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1236                        printk(" . %pI4", &trt->rt_dst);
1237                printk("\n");
1238        }
1239#endif
1240        /*
1241         * Since lookup is lockfree, we must make sure
1242         * previous writes to rt are comitted to memory
1243         * before making rt visible to other CPUS.
1244         */
1245        rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1246
1247        spin_unlock_bh(rt_hash_lock_addr(hash));
1248
1249skip_hashing:
1250        if (rp)
1251                *rp = rt;
1252        else
1253                skb_dst_set(skb, &rt->u.dst);
1254        return 0;
1255}
1256
1257void rt_bind_peer(struct rtable *rt, int create)
1258{
1259        static DEFINE_SPINLOCK(rt_peer_lock);
1260        struct inet_peer *peer;
1261
1262        peer = inet_getpeer(rt->rt_dst, create);
1263
1264        spin_lock_bh(&rt_peer_lock);
1265        if (rt->peer == NULL) {
1266                rt->peer = peer;
1267                peer = NULL;
1268        }
1269        spin_unlock_bh(&rt_peer_lock);
1270        if (peer)
1271                inet_putpeer(peer);
1272}
1273
1274/*
1275 * Peer allocation may fail only in serious out-of-memory conditions.  However
1276 * we still can generate some output.
1277 * Random ID selection looks a bit dangerous because we have no chances to
1278 * select ID being unique in a reasonable period of time.
1279 * But broken packet identifier may be better than no packet at all.
1280 */
1281static void ip_select_fb_ident(struct iphdr *iph)
1282{
1283        static DEFINE_SPINLOCK(ip_fb_id_lock);
1284        static u32 ip_fallback_id;
1285        u32 salt;
1286
1287        spin_lock_bh(&ip_fb_id_lock);
1288        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1289        iph->id = htons(salt & 0xFFFF);
1290        ip_fallback_id = salt;
1291        spin_unlock_bh(&ip_fb_id_lock);
1292}
1293
1294void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1295{
1296        struct rtable *rt = (struct rtable *) dst;
1297
1298        if (rt) {
1299                if (rt->peer == NULL)
1300                        rt_bind_peer(rt, 1);
1301
1302                /* If peer is attached to destination, it is never detached,
1303                   so that we need not to grab a lock to dereference it.
1304                 */
1305                if (rt->peer) {
1306                        iph->id = htons(inet_getid(rt->peer, more));
1307                        return;
1308                }
1309        } else
1310                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1311                       __builtin_return_address(0));
1312
1313        ip_select_fb_ident(iph);
1314}
1315
1316static void rt_del(unsigned hash, struct rtable *rt)
1317{
1318        struct rtable **rthp, *aux;
1319
1320        rthp = &rt_hash_table[hash].chain;
1321        spin_lock_bh(rt_hash_lock_addr(hash));
1322        ip_rt_put(rt);
1323        while ((aux = *rthp) != NULL) {
1324                if (aux == rt || rt_is_expired(aux)) {
1325                        *rthp = aux->u.dst.rt_next;
1326                        rt_free(aux);
1327                        continue;
1328                }
1329                rthp = &aux->u.dst.rt_next;
1330        }
1331        spin_unlock_bh(rt_hash_lock_addr(hash));
1332}
1333
1334void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1335                    __be32 saddr, struct net_device *dev)
1336{
1337        int i, k;
1338        struct in_device *in_dev = in_dev_get(dev);
1339        struct rtable *rth, **rthp;
1340        __be32  skeys[2] = { saddr, 0 };
1341        int  ikeys[2] = { dev->ifindex, 0 };
1342        struct netevent_redirect netevent;
1343        struct net *net;
1344
1345        if (!in_dev)
1346                return;
1347
1348        net = dev_net(dev);
1349        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1350            || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1351            || ipv4_is_zeronet(new_gw))
1352                goto reject_redirect;
1353
1354        if (!rt_caching(net))
1355                goto reject_redirect;
1356
1357        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1358                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1359                        goto reject_redirect;
1360                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1361                        goto reject_redirect;
1362        } else {
1363                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1364                        goto reject_redirect;
1365        }
1366
1367        for (i = 0; i < 2; i++) {
1368                for (k = 0; k < 2; k++) {
1369                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1370                                                rt_genid(net));
1371
1372                        rthp=&rt_hash_table[hash].chain;
1373
1374                        rcu_read_lock();
1375                        while ((rth = rcu_dereference(*rthp)) != NULL) {
1376                                struct rtable *rt;
1377
1378                                if (rth->fl.fl4_dst != daddr ||
1379                                    rth->fl.fl4_src != skeys[i] ||
1380                                    rth->fl.oif != ikeys[k] ||
1381                                    rth->fl.iif != 0 ||
1382                                    rt_is_expired(rth) ||
1383                                    !net_eq(dev_net(rth->u.dst.dev), net)) {
1384                                        rthp = &rth->u.dst.rt_next;
1385                                        continue;
1386                                }
1387
1388                                if (rth->rt_dst != daddr ||
1389                                    rth->rt_src != saddr ||
1390                                    rth->u.dst.error ||
1391                                    rth->rt_gateway != old_gw ||
1392                                    rth->u.dst.dev != dev)
1393                                        break;
1394
1395                                dst_hold(&rth->u.dst);
1396                                rcu_read_unlock();
1397
1398                                rt = dst_alloc(&ipv4_dst_ops);
1399                                if (rt == NULL) {
1400                                        ip_rt_put(rth);
1401                                        in_dev_put(in_dev);
1402                                        return;
1403                                }
1404
1405                                /* Copy all the information. */
1406                                *rt = *rth;
1407                                rt->u.dst.__use         = 1;
1408                                atomic_set(&rt->u.dst.__refcnt, 1);
1409                                rt->u.dst.child         = NULL;
1410                                if (rt->u.dst.dev)
1411                                        dev_hold(rt->u.dst.dev);
1412                                if (rt->idev)
1413                                        in_dev_hold(rt->idev);
1414                                rt->u.dst.obsolete      = 0;
1415                                rt->u.dst.lastuse       = jiffies;
1416                                rt->u.dst.path          = &rt->u.dst;
1417                                rt->u.dst.neighbour     = NULL;
1418                                rt->u.dst.hh            = NULL;
1419#ifdef CONFIG_XFRM
1420                                rt->u.dst.xfrm          = NULL;
1421#endif
1422                                rt->rt_genid            = rt_genid(net);
1423                                rt->rt_flags            |= RTCF_REDIRECTED;
1424
1425                                /* Gateway is different ... */
1426                                rt->rt_gateway          = new_gw;
1427
1428                                /* Redirect received -> path was valid */
1429                                dst_confirm(&rth->u.dst);
1430
1431                                if (rt->peer)
1432                                        atomic_inc(&rt->peer->refcnt);
1433
1434                                if (arp_bind_neighbour(&rt->u.dst) ||
1435                                    !(rt->u.dst.neighbour->nud_state &
1436                                            NUD_VALID)) {
1437                                        if (rt->u.dst.neighbour)
1438                                                neigh_event_send(rt->u.dst.neighbour, NULL);
1439                                        ip_rt_put(rth);
1440                                        rt_drop(rt);
1441                                        goto do_next;
1442                                }
1443
1444                                netevent.old = &rth->u.dst;
1445                                netevent.new = &rt->u.dst;
1446                                call_netevent_notifiers(NETEVENT_REDIRECT,
1447                                                        &netevent);
1448
1449                                rt_del(hash, rth);
1450                                if (!rt_intern_hash(hash, rt, &rt, NULL))
1451                                        ip_rt_put(rt);
1452                                goto do_next;
1453                        }
1454                        rcu_read_unlock();
1455                do_next:
1456                        ;
1457                }
1458        }
1459        in_dev_put(in_dev);
1460        return;
1461
1462reject_redirect:
1463#ifdef CONFIG_IP_ROUTE_VERBOSE
1464        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1465                printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1466                        "  Advised path = %pI4 -> %pI4\n",
1467                       &old_gw, dev->name, &new_gw,
1468                       &saddr, &daddr);
1469#endif
1470        in_dev_put(in_dev);
1471}
1472
1473static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1474{
1475        struct rtable *rt = (struct rtable *)dst;
1476        struct dst_entry *ret = dst;
1477
1478        if (rt) {
1479                if (dst->obsolete) {
1480                        ip_rt_put(rt);
1481                        ret = NULL;
1482                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1483                           rt->u.dst.expires) {
1484                        unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1485                                                rt->fl.oif,
1486                                                rt_genid(dev_net(dst->dev)));
1487#if RT_CACHE_DEBUG >= 1
1488                        printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1489                                &rt->rt_dst, rt->fl.fl4_tos);
1490#endif
1491                        rt_del(hash, rt);
1492                        ret = NULL;
1493                }
1494        }
1495        return ret;
1496}
1497
1498/*
1499 * Algorithm:
1500 *      1. The first ip_rt_redirect_number redirects are sent
1501 *         with exponential backoff, then we stop sending them at all,
1502 *         assuming that the host ignores our redirects.
1503 *      2. If we did not see packets requiring redirects
1504 *         during ip_rt_redirect_silence, we assume that the host
1505 *         forgot redirected route and start to send redirects again.
1506 *
1507 * This algorithm is much cheaper and more intelligent than dumb load limiting
1508 * in icmp.c.
1509 *
1510 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1511 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1512 */
1513
1514void ip_rt_send_redirect(struct sk_buff *skb)
1515{
1516        struct rtable *rt = skb_rtable(skb);
1517        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1518
1519        if (!in_dev)
1520                return;
1521
1522        if (!IN_DEV_TX_REDIRECTS(in_dev))
1523                goto out;
1524
1525        /* No redirected packets during ip_rt_redirect_silence;
1526         * reset the algorithm.
1527         */
1528        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1529                rt->u.dst.rate_tokens = 0;
1530
1531        /* Too many ignored redirects; do not send anything
1532         * set u.dst.rate_last to the last seen redirected packet.
1533         */
1534        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1535                rt->u.dst.rate_last = jiffies;
1536                goto out;
1537        }
1538
1539        /* Check for load limit; set rate_last to the latest sent
1540         * redirect.
1541         */
1542        if (rt->u.dst.rate_tokens == 0 ||
1543            time_after(jiffies,
1544                       (rt->u.dst.rate_last +
1545                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1546                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1547                rt->u.dst.rate_last = jiffies;
1548                ++rt->u.dst.rate_tokens;
1549#ifdef CONFIG_IP_ROUTE_VERBOSE
1550                if (IN_DEV_LOG_MARTIANS(in_dev) &&
1551                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1552                    net_ratelimit())
1553                        printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1554                                &rt->rt_src, rt->rt_iif,
1555                                &rt->rt_dst, &rt->rt_gateway);
1556#endif
1557        }
1558out:
1559        in_dev_put(in_dev);
1560}
1561
1562static int ip_error(struct sk_buff *skb)
1563{
1564        struct rtable *rt = skb_rtable(skb);
1565        unsigned long now;
1566        int code;
1567
1568        switch (rt->u.dst.error) {
1569                case EINVAL:
1570                default:
1571                        goto out;
1572                case EHOSTUNREACH:
1573                        code = ICMP_HOST_UNREACH;
1574                        break;
1575                case ENETUNREACH:
1576                        code = ICMP_NET_UNREACH;
1577                        IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1578                                        IPSTATS_MIB_INNOROUTES);
1579                        break;
1580                case EACCES:
1581                        code = ICMP_PKT_FILTERED;
1582                        break;
1583        }
1584
1585        now = jiffies;
1586        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1587        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1588                rt->u.dst.rate_tokens = ip_rt_error_burst;
1589        rt->u.dst.rate_last = now;
1590        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1591                rt->u.dst.rate_tokens -= ip_rt_error_cost;
1592                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1593        }
1594
1595out:    kfree_skb(skb);
1596        return 0;
1597}
1598
1599/*
1600 *      The last two values are not from the RFC but
1601 *      are needed for AMPRnet AX.25 paths.
1602 */
1603
/* Candidate MTU values, in strictly descending order. */
static const unsigned short mtu_plateau[] = {
        32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * guess_mtu - return the largest plateau strictly below @old_mtu,
 * or 68 when @old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *step = mtu_plateau;
        const unsigned short *end =
                mtu_plateau + sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

        /* The table is descending, so the first hit is the largest one. */
        while (step != end) {
                if (*step < old_mtu)
                        return *step;
                step++;
        }

        return 68;
}
1616
1617unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1618                                 unsigned short new_mtu,
1619                                 struct net_device *dev)
1620{
1621        int i, k;
1622        unsigned short old_mtu = ntohs(iph->tot_len);
1623        struct rtable *rth;
1624        int  ikeys[2] = { dev->ifindex, 0 };
1625        __be32  skeys[2] = { iph->saddr, 0, };
1626        __be32  daddr = iph->daddr;
1627        unsigned short est_mtu = 0;
1628
1629        if (ipv4_config.no_pmtu_disc)
1630                return 0;
1631
1632        for (k = 0; k < 2; k++) {
1633                for (i = 0; i < 2; i++) {
1634                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1635                                                rt_genid(net));
1636
1637                        rcu_read_lock();
1638                        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1639                             rth = rcu_dereference(rth->u.dst.rt_next)) {
1640                                unsigned short mtu = new_mtu;
1641
1642                                if (rth->fl.fl4_dst != daddr ||
1643                                    rth->fl.fl4_src != skeys[i] ||
1644                                    rth->rt_dst != daddr ||
1645                                    rth->rt_src != iph->saddr ||
1646                                    rth->fl.oif != ikeys[k] ||
1647                                    rth->fl.iif != 0 ||
1648                                    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1649                                    !net_eq(dev_net(rth->u.dst.dev), net) ||
1650                                    rt_is_expired(rth))
1651                                        continue;
1652
1653                                if (new_mtu < 68 || new_mtu >= old_mtu) {
1654
1655                                        /* BSD 4.2 compatibility hack :-( */
1656                                        if (mtu == 0 &&
1657                                            old_mtu >= dst_mtu(&rth->u.dst) &&
1658                                            old_mtu >= 68 + (iph->ihl << 2))
1659                                                old_mtu -= iph->ihl << 2;
1660
1661                                        mtu = guess_mtu(old_mtu);
1662                                }
1663                                if (mtu <= dst_mtu(&rth->u.dst)) {
1664                                        if (mtu < dst_mtu(&rth->u.dst)) {
1665                                                dst_confirm(&rth->u.dst);
1666                                                if (mtu < ip_rt_min_pmtu) {
1667                                                        mtu = ip_rt_min_pmtu;
1668                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
1669                                                                (1 << RTAX_MTU);
1670                                                }
1671                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1672                                                dst_set_expires(&rth->u.dst,
1673                                                        ip_rt_mtu_expires);
1674                                        }
1675                                        est_mtu = mtu;
1676                                }
1677                        }
1678                        rcu_read_unlock();
1679                }
1680        }
1681        return est_mtu ? : new_mtu;
1682}
1683
1684static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1685{
1686        if (dst_mtu(dst) > mtu && mtu >= 68 &&
1687            !(dst_metric_locked(dst, RTAX_MTU))) {
1688                if (mtu < ip_rt_min_pmtu) {
1689                        mtu = ip_rt_min_pmtu;
1690                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1691                }
1692                dst->metrics[RTAX_MTU-1] = mtu;
1693                dst_set_expires(dst, ip_rt_mtu_expires);
1694                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1695        }
1696}
1697
1698static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1699{
1700        return NULL;
1701}
1702
1703static void ipv4_dst_destroy(struct dst_entry *dst)
1704{
1705        struct rtable *rt = (struct rtable *) dst;
1706        struct inet_peer *peer = rt->peer;
1707        struct in_device *idev = rt->idev;
1708
1709        if (peer) {
1710                rt->peer = NULL;
1711                inet_putpeer(peer);
1712        }
1713
1714        if (idev) {
1715                rt->idev = NULL;
1716                in_dev_put(idev);
1717        }
1718}
1719
1720static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1721                            int how)
1722{
1723        struct rtable *rt = (struct rtable *) dst;
1724        struct in_device *idev = rt->idev;
1725        if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1726                struct in_device *loopback_idev =
1727                        in_dev_get(dev_net(dev)->loopback_dev);
1728                if (loopback_idev) {
1729                        rt->idev = loopback_idev;
1730                        in_dev_put(idev);
1731                }
1732        }
1733}
1734
1735static void ipv4_link_failure(struct sk_buff *skb)
1736{
1737        struct rtable *rt;
1738
1739        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1740
1741        rt = skb_rtable(skb);
1742        if (rt)
1743                dst_set_expires(&rt->u.dst, 0);
1744}
1745
1746static int ip_rt_bug(struct sk_buff *skb)
1747{
1748        printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1749                &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1750                skb->dev ? skb->dev->name : "?");
1751        kfree_skb(skb);
1752        return 0;
1753}
1754
1755/*
1756   We do not cache source address of outgoing interface,
1757   because it is used only by IP RR, TS and SRR options,
1758   so that it out of fast path.
1759
1760   BTW remember: "addr" is allowed to be not aligned
1761   in IP options!
1762 */
1763
1764void ip_rt_get_source(u8 *addr, struct rtable *rt)
1765{
1766        __be32 src;
1767        struct fib_result res;
1768
1769        if (rt->fl.iif == 0)
1770                src = rt->rt_src;
1771        else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1772                src = FIB_RES_PREFSRC(res);
1773                fib_res_put(&res);
1774        } else
1775                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1776                                        RT_SCOPE_UNIVERSE);
1777        memcpy(addr, &src, 4);
1778}
1779
#ifdef CONFIG_NET_CLS_ROUTE
/*
 *	Merge @tag into the route's tclassid one 16-bit half at a time:
 *	a half of tclassid is filled from @tag only if it is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1789
1790static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1791{
1792        struct fib_info *fi = res->fi;
1793
1794        if (fi) {
1795                if (FIB_RES_GW(*res) &&
1796                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1797                        rt->rt_gateway = FIB_RES_GW(*res);
1798                memcpy(rt->u.dst.metrics, fi->fib_metrics,
1799                       sizeof(rt->u.dst.metrics));
1800                if (fi->fib_mtu == 0) {
1801                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1802                        if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1803                            rt->rt_gateway != rt->rt_dst &&
1804                            rt->u.dst.dev->mtu > 576)
1805                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
1806                }
1807#ifdef CONFIG_NET_CLS_ROUTE
1808                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1809#endif
1810        } else
1811                rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1812
1813        if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1814                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1815        if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1816                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1817        if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1818                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1819                                       ip_rt_min_advmss);
1820        if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1821                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1822
1823#ifdef CONFIG_NET_CLS_ROUTE
1824#ifdef CONFIG_IP_MULTIPLE_TABLES
1825        set_class_tag(rt, fib_rules_tclass(res));
1826#endif
1827        set_class_tag(rt, itag);
1828#endif
1829        rt->rt_type = res->type;
1830}
1831
1832static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1833                                u8 tos, struct net_device *dev, int our)
1834{
1835        unsigned hash;
1836        struct rtable *rth;
1837        __be32 spec_dst;
1838        struct in_device *in_dev = in_dev_get(dev);
1839        u32 itag = 0;
1840
1841        /* Primary sanity checks. */
1842
1843        if (in_dev == NULL)
1844                return -EINVAL;
1845
1846        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1847            ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1848                goto e_inval;
1849
1850        if (ipv4_is_zeronet(saddr)) {
1851                if (!ipv4_is_local_multicast(daddr))
1852                        goto e_inval;
1853                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1854        } else if (fib_validate_source(saddr, 0, tos, 0,
1855                                        dev, &spec_dst, &itag) < 0)
1856                goto e_inval;
1857
1858        rth = dst_alloc(&ipv4_dst_ops);
1859        if (!rth)
1860                goto e_nobufs;
1861
1862        rth->u.dst.output= ip_rt_bug;
1863
1864        atomic_set(&rth->u.dst.__refcnt, 1);
1865        rth->u.dst.flags= DST_HOST;
1866        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1867                rth->u.dst.flags |= DST_NOPOLICY;
1868        rth->fl.fl4_dst = daddr;
1869        rth->rt_dst     = daddr;
1870        rth->fl.fl4_tos = tos;
1871        rth->fl.mark    = skb->mark;
1872        rth->fl.fl4_src = saddr;
1873        rth->rt_src     = saddr;
1874#ifdef CONFIG_NET_CLS_ROUTE
1875        rth->u.dst.tclassid = itag;
1876#endif
1877        rth->rt_iif     =
1878        rth->fl.iif     = dev->ifindex;
1879        rth->u.dst.dev  = init_net.loopback_dev;
1880        dev_hold(rth->u.dst.dev);
1881        rth->idev       = in_dev_get(rth->u.dst.dev);
1882        rth->fl.oif     = 0;
1883        rth->rt_gateway = daddr;
1884        rth->rt_spec_dst= spec_dst;
1885        rth->rt_genid   = rt_genid(dev_net(dev));
1886        rth->rt_flags   = RTCF_MULTICAST;
1887        rth->rt_type    = RTN_MULTICAST;
1888        if (our) {
1889                rth->u.dst.input= ip_local_deliver;
1890                rth->rt_flags |= RTCF_LOCAL;
1891        }
1892
1893#ifdef CONFIG_IP_MROUTE
1894        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1895                rth->u.dst.input = ip_mr_input;
1896#endif
1897        RT_CACHE_STAT_INC(in_slow_mc);
1898
1899        in_dev_put(in_dev);
1900        hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1901        return rt_intern_hash(hash, rth, NULL, skb);
1902
1903e_nobufs:
1904        in_dev_put(in_dev);
1905        return -ENOBUFS;
1906
1907e_inval:
1908        in_dev_put(in_dev);
1909        return -EINVAL;
1910}
1911
1912
1913static void ip_handle_martian_source(struct net_device *dev,
1914                                     struct in_device *in_dev,
1915                                     struct sk_buff *skb,
1916                                     __be32 daddr,
1917                                     __be32 saddr)
1918{
1919        RT_CACHE_STAT_INC(in_martian_src);
1920#ifdef CONFIG_IP_ROUTE_VERBOSE
1921        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1922                /*
1923                 *      RFC1812 recommendation, if source is martian,
1924                 *      the only hint is MAC header.
1925                 */
1926                printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1927                        &daddr, &saddr, dev->name);
1928                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1929                        int i;
1930                        const unsigned char *p = skb_mac_header(skb);
1931                        printk(KERN_WARNING "ll header: ");
1932                        for (i = 0; i < dev->hard_header_len; i++, p++) {
1933                                printk("%02x", *p);
1934                                if (i < (dev->hard_header_len - 1))
1935                                        printk(":");
1936                        }
1937                        printk("\n");
1938                }
1939        }
1940#endif
1941}
1942
1943static int __mkroute_input(struct sk_buff *skb,
1944                           struct fib_result *res,
1945                           struct in_device *in_dev,
1946                           __be32 daddr, __be32 saddr, u32 tos,
1947                           struct rtable **result)
1948{
1949
1950        struct rtable *rth;
1951        int err;
1952        struct in_device *out_dev;
1953        unsigned flags = 0;
1954        __be32 spec_dst;
1955        u32 itag;
1956
1957        /* get a working reference to the output device */
1958        out_dev = in_dev_get(FIB_RES_DEV(*res));
1959        if (out_dev == NULL) {
1960                if (net_ratelimit())
1961                        printk(KERN_CRIT "Bug in ip_route_input" \
1962                               "_slow(). Please, report\n");
1963                return -EINVAL;
1964        }
1965
1966
1967        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1968                                  in_dev->dev, &spec_dst, &itag);
1969        if (err < 0) {
1970                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1971                                         saddr);
1972
1973                err = -EINVAL;
1974                goto cleanup;
1975        }
1976
1977        if (err)
1978                flags |= RTCF_DIRECTSRC;
1979
1980        if (out_dev == in_dev && err &&
1981            (IN_DEV_SHARED_MEDIA(out_dev) ||
1982             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1983                flags |= RTCF_DOREDIRECT;
1984
1985        if (skb->protocol != htons(ETH_P_IP)) {
1986                /* Not IP (i.e. ARP). Do not create route, if it is
1987                 * invalid for proxy arp. DNAT routes are always valid.
1988                 */
1989                if (out_dev == in_dev) {
1990                        err = -EINVAL;
1991                        goto cleanup;
1992                }
1993        }
1994
1995
1996        rth = dst_alloc(&ipv4_dst_ops);
1997        if (!rth) {
1998                err = -ENOBUFS;
1999                goto cleanup;
2000        }
2001
2002        atomic_set(&rth->u.dst.__refcnt, 1);
2003        rth->u.dst.flags= DST_HOST;
2004        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2005                rth->u.dst.flags |= DST_NOPOLICY;
2006        if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2007                rth->u.dst.flags |= DST_NOXFRM;
2008        rth->fl.fl4_dst = daddr;
2009        rth->rt_dst     = daddr;
2010        rth->fl.fl4_tos = tos;
2011        rth->fl.mark    = skb->mark;
2012        rth->fl.fl4_src = saddr;
2013        rth->rt_src     = saddr;
2014        rth->rt_gateway = daddr;
2015        rth->rt_iif     =
2016                rth->fl.iif     = in_dev->dev->ifindex;
2017        rth->u.dst.dev  = (out_dev)->dev;
2018        dev_hold(rth->u.dst.dev);
2019        rth->idev       = in_dev_get(rth->u.dst.dev);
2020        rth->fl.oif     = 0;
2021        rth->rt_spec_dst= spec_dst;
2022
2023        rth->u.dst.input = ip_forward;
2024        rth->u.dst.output = ip_output;
2025        rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2026
2027        rt_set_nexthop(rth, res, itag);
2028
2029        rth->rt_flags = flags;
2030
2031        *result = rth;
2032        err = 0;
2033 cleanup:
2034        /* release the working reference to the output device */
2035        in_dev_put(out_dev);
2036        return err;
2037}
2038
2039static int ip_mkroute_input(struct sk_buff *skb,
2040                            struct fib_result *res,
2041                            const struct flowi *fl,
2042                            struct in_device *in_dev,
2043                            __be32 daddr, __be32 saddr, u32 tos)
2044{
2045        struct rtable* rth = NULL;
2046        int err;
2047        unsigned hash;
2048
2049#ifdef CONFIG_IP_ROUTE_MULTIPATH
2050        if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2051                fib_select_multipath(fl, res);
2052#endif
2053
2054        /* create a routing cache entry */
2055        err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2056        if (err)
2057                return err;
2058
2059        /* put it into the cache */
2060        hash = rt_hash(daddr, saddr, fl->iif,
2061                       rt_genid(dev_net(rth->u.dst.dev)));
2062        return rt_intern_hash(hash, rth, NULL, skb);
2063}
2064
2065/*
2066 *      NOTE. We drop all packets that have a local source
2067 *      address, because every properly looped-back packet
2068 *      must already have the correct destination attached by the output routine.
2069 *
2070 *      This approach solves two big problems:
2071 *      1. Non-simplex devices are handled properly.
2072 *      2. IP spoofing attempts are filtered with a 100% guarantee.
2073 */
2074
/*
 *      ip_route_input_slow - resolve an input route without the cache.
 *
 *      Called by ip_route_input() for non-multicast destinations when the
 *      cache lookup produced no hit (or caching is off).  The packet is
 *      classified as martian, broadcast, local, forwarded or unroutable.
 *      For the broadcast, local and unroutable cases an rtable is built
 *      here and inserted with rt_intern_hash(); forwarded packets are
 *      handed to ip_mkroute_input().
 *
 *      Returns the result of ip_mkroute_input()/rt_intern_hash() on the
 *      paths that create an entry, otherwise -EINVAL (no in_device,
 *      martian source, non-IP broadcast), -EHOSTUNREACH (martian
 *      destination, or forwarding disabled) or -ENOBUFS (dst_alloc()
 *      failed).
 */
2075static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2076                               u8 tos, struct net_device *dev)
2077{
2078        struct fib_result res;
2079        struct in_device *in_dev = in_dev_get(dev);
2080        struct flowi fl = { .nl_u = { .ip4_u =
2081                                      { .daddr = daddr,
2082                                        .saddr = saddr,
2083                                        .tos = tos,
2084                                        .scope = RT_SCOPE_UNIVERSE,
2085                                      } },
2086                            .mark = skb->mark,
2087                            .iif = dev->ifindex };
2088        unsigned        flags = 0;
2089        u32             itag = 0;
2090        struct rtable * rth;
2091        unsigned        hash;
2092        __be32          spec_dst;
2093        int             err = -EINVAL;
        /* Set once fib_lookup() succeeded; tells "done" to release res. */
2094        int             free_res = 0;
2095        struct net    * net = dev_net(dev);
2096
2097        /* IP on this device is disabled. */
2098
        /* Nothing to put yet, so leave via "out" with err == -EINVAL. */
2099        if (!in_dev)
2100                goto out;
2101
2102        /* Check for the most weird martians, which can be not detected
2103           by fib_lookup.
2104         */
2105
2106        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2107            ipv4_is_loopback(saddr))
2108                goto martian_source;
2109
        /*
         * Limited broadcast, or both addresses zero: handled as broadcast
         * input without a FIB lookup (res is not filled in on this path;
         * brd_input sets res.type itself).
         */
2110        if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2111                goto brd_input;
2112
2113        /* Accept zero addresses only to limited broadcast;
2114         * I even do not know to fix it or not. Waiting for complains :-)
2115         */
2116        if (ipv4_is_zeronet(saddr))
2117                goto martian_source;
2118
2119        if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2120            ipv4_is_loopback(daddr))
2121                goto martian_destination;
2122
2123        /*
2124         *      Now we are ready to route packet.
2125         */
        /*
         * Lookup failure: with forwarding disabled report host
         * unreachable, otherwise go on to "no_route", which builds an
         * unreachable entry carrying err.
         */
2126        if ((err = fib_lookup(net, &fl, &res)) != 0) {
2127                if (!IN_DEV_FORWARD(in_dev))
2128                        goto e_hostunreach;
2129                goto no_route;
2130        }
2131        free_res = 1;
2132
2133        RT_CACHE_STAT_INC(in_slow_tot);
2134
2135        if (res.type == RTN_BROADCAST)
2136                goto brd_input;
2137
2138        if (res.type == RTN_LOCAL) {
2139                int result;
2140                result = fib_validate_source(saddr, daddr, tos,
2141                                             net->loopback_dev->ifindex,
2142                                             dev, &spec_dst, &itag);
2143                if (result < 0)
2144                        goto martian_source;
2145                if (result)
2146                        flags |= RTCF_DIRECTSRC;
                /* Replaces the spec_dst written by fib_validate_source(). */
2147                spec_dst = daddr;
2148                goto local_input;
2149        }
2150
        /* Everything below would require forwarding the packet. */
2151        if (!IN_DEV_FORWARD(in_dev))
2152                goto e_hostunreach;
2153        if (res.type != RTN_UNICAST)
2154                goto martian_destination;
2155
2156        err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
        /*
         * Common exit: drop the in_dev reference taken at entry and
         * release the FIB result only if the lookup succeeded.
         */
2157done:
2158        in_dev_put(in_dev);
2159        if (free_res)
2160                fib_res_put(&res);
2161out:    return err;
2162
        /*
         * Broadcast input.  Reached either before any FIB lookup (limited
         * broadcast / all-zero addresses) or after a lookup that returned
         * RTN_BROADCAST.  Falls through into local_input.
         */
2163brd_input:
2164        if (skb->protocol != htons(ETH_P_IP))
2165                goto e_inval;
2166
2167        if (ipv4_is_zeronet(saddr))
2168                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2169        else {
2170                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2171                                          &itag);
2172                if (err < 0)
2173                        goto martian_source;
2174                if (err)
2175                        flags |= RTCF_DIRECTSRC;
2176        }
2177        flags |= RTCF_BROADCAST;
2178        res.type = RTN_BROADCAST;
2179        RT_CACHE_STAT_INC(in_brd);
2180
        /*
         * Build an entry bound to the namespace's loopback device.  Shared
         * by local delivery, broadcast input and (via no_route) the
         * unreachable case.
         */
2181local_input:
2182        rth = dst_alloc(&ipv4_dst_ops);
2183        if (!rth)
2184                goto e_nobufs;
2185
2186        rth->u.dst.output= ip_rt_bug;
2187        rth->rt_genid = rt_genid(net);
2188
2189        atomic_set(&rth->u.dst.__refcnt, 1);
2190        rth->u.dst.flags= DST_HOST;
2191        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2192                rth->u.dst.flags |= DST_NOPOLICY;
2193        rth->fl.fl4_dst = daddr;
2194        rth->rt_dst     = daddr;
2195        rth->fl.fl4_tos = tos;
2196        rth->fl.mark    = skb->mark;
2197        rth->fl.fl4_src = saddr;
2198        rth->rt_src     = saddr;
2199#ifdef CONFIG_NET_CLS_ROUTE
2200        rth->u.dst.tclassid = itag;
2201#endif
2202        rth->rt_iif     =
2203        rth->fl.iif     = dev->ifindex;
2204        rth->u.dst.dev  = net->loopback_dev;
2205        dev_hold(rth->u.dst.dev);
2206        rth->idev       = in_dev_get(rth->u.dst.dev);
2207        rth->rt_gateway = daddr;
2208        rth->rt_spec_dst= spec_dst;
2209        rth->u.dst.input= ip_local_deliver;
2210        rth->rt_flags   = flags|RTCF_LOCAL;
        /*
         * Only the no_route path sets RTN_UNREACHABLE; err then still
         * holds the lookup error, which is stored negated in dst.error
         * and the packet is handed to ip_error() instead of being
         * delivered locally.
         */
2211        if (res.type == RTN_UNREACHABLE) {
2212                rth->u.dst.input= ip_error;
2213                rth->u.dst.error= -err;
2214                rth->rt_flags   &= ~RTCF_LOCAL;
2215        }
2216        rth->rt_type    = res.type;
2217        hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2218        err = rt_intern_hash(hash, rth, NULL, skb);
2219        goto done;
2220
        /*
         * FIB lookup failed while forwarding is enabled: create an
         * unreachable entry via local_input.  -ESRCH is mapped to
         * -ENETUNREACH.
         */
2221no_route:
2222        RT_CACHE_STAT_INC(in_no_route);
2223        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2224        res.type = RTN_UNREACHABLE;
2225        if (err == -ESRCH)
2226                err = -ENETUNREACH;
2227        goto local_input;
2228
2229        /*
2230         *      Do not cache martian addresses: they should be logged (RFC1812)
2231         */
2232martian_destination:
2233        RT_CACHE_STAT_INC(in_martian_dst);
2234#ifdef CONFIG_IP_ROUTE_VERBOSE
2235        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2236                printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2237                        &daddr, &saddr, dev->name);
2238#endif
2239
        /* martian_destination falls through to here. */
2240e_hostunreach:
2241        err = -EHOSTUNREACH;
2242        goto done;
2243
2244e_inval:
2245        err = -EINVAL;
2246        goto done;
2247
2248e_nobufs:
2249        err = -ENOBUFS;
2250        goto done;
2251
        /* Log/account the martian source, then fail with -EINVAL. */
2252martian_source:
2253        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2254        goto e_inval;
2255}
2256
2257int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2258                   u8 tos, struct net_device *dev)
2259{
2260        struct rtable * rth;
2261        unsigned        hash;
2262        int iif = dev->ifindex;
2263        struct net *net;
2264
2265        net = dev_net(dev);
2266
2267        if (!rt_caching(net))
2268                goto skip_cache;
2269
2270        tos &= IPTOS_RT_MASK;
2271        hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2272
2273        rcu_read_lock();
2274        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2275             rth = rcu_dereference(rth->u.dst.rt_next)) {
2276                if (((rth->fl.fl4_dst ^ daddr) |
2277                     (rth->fl.fl4_src ^ saddr) |
2278                     (rth->fl.iif ^ iif) |
2279                     rth->fl.oif |
2280                     (rth->fl.fl4_tos ^ tos)) == 0 &&
2281                    rth->fl.mark == skb->mark &&
2282                    net_eq(dev_net(rth->u.dst.dev), net) &&
2283                    !rt_is_expired(rth)) {
2284                        dst_use(&rth->u.dst, jiffies);
2285                        RT_CACHE_STAT_INC(in_hit);
2286                        rcu_read_unlock();
2287                        skb_dst_set(skb, &rth->u.dst);
2288                        return 0;
2289                }
2290                RT_CACHE_STAT_INC(in_hlist_search);
2291        }
2292        rcu_read_unlock();
2293
2294skip_cache:
2295        /* Multicast recognition logic is moved from route cache to here.
2296           The problem was that too many Ethernet cards have broken/missing
2297           hardware multicast filters :-( As result the host on multicasting
2298           network acquires a lot of useless route cache entries, sort of
2299           SDR messages from all the world. Now we try to get rid of them.
2300           Really, provided software IP multicast filter is organized
2301           reasonably (at least, hashed), it does not result in a slowdown
2302           comparing with route cache reject entries.
2303           Note, that multicast routers are not affected, because
2304           route cache entry is created eventually.
2305         */
2306        if (ipv4_is_multicast(daddr)) {
2307                struct in_device *in_dev;
2308
2309                rcu_read_lock();
2310                if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2311                        int our = ip_check_mc(in_dev, daddr, saddr,
2312                                ip_hdr(skb)->protocol);
2313                        if (our
2314#ifdef CONFIG_IP_MROUTE
2315                            || (!ipv4_is_local_multicast(daddr) &&
2316                                IN_DEV_MFORWARD(in_dev))
2317#endif
2318                            ) {
2319                                rcu_read_unlock();
2320                                return ip_route_input_mc(skb, daddr, saddr,
2321                                                         tos, dev, our);
2322                        }
2323                }
2324                rcu_read_unlock();
2325                return -EINVAL;
2326        }
2327        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2328}
2329
2330static int __mkroute_output(struct rtable **result,
2331                            struct fib_result *res,
2332                            const struct flowi *fl,
2333                            const struct flowi *oldflp,
2334                            struct net_device *dev_out,
2335                            unsigned flags)
2336{
2337        struct rtable *rth;
2338        struct in_device *in_dev;
2339        u32 tos = RT_FL_TOS(oldflp);
2340        int err = 0;
2341
2342        if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2343                return -EINVAL;
2344
2345        if (fl->fl4_dst == htonl(0xFFFFFFFF))
2346                res->type = RTN_BROADCAST;
2347        else if (ipv4_is_multicast(fl->fl4_dst))
2348                res->type = RTN_MULTICAST;
2349        else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2350                return -EINVAL;
2351
2352        if (dev_out->flags & IFF_LOOPBACK)
2353                flags |= RTCF_LOCAL;
2354
2355        /* get work reference to inet device */
2356        in_dev = in_dev_get(dev_out);
2357        if (!in_dev)
2358                return -EINVAL;
2359
2360        if (res->type == RTN_BROADCAST) {
2361                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2362                if (res->fi) {
2363                        fib_info_put(res->fi);
2364                        res->fi = NULL;
2365                }
2366        } else if (res->type == RTN_MULTICAST) {
2367                flags |= RTCF_MULTICAST|RTCF_LOCAL;
2368                if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2369                                 oldflp->proto))
2370                        flags &= ~RTCF_LOCAL;
2371                /* If multicast route do not exist use
2372                   default one, but do not gateway in this case.
2373                   Yes, it is hack.
2374                 */
2375                if (res->fi && res->prefixlen < 4) {
2376                        fib_info_put(res->fi);
2377                        res->fi = NULL;
2378                }
2379        }
2380
2381
2382        rth = dst_alloc(&ipv4_dst_ops);
2383        if (!rth) {
2384                err = -ENOBUFS;
2385                goto cleanup;
2386        }
2387
2388        atomic_set(&rth->u.dst.__refcnt, 1);
2389        rth->u.dst.flags= DST_HOST;
2390        if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2391                rth->u.dst.flags |= DST_NOXFRM;
2392        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2393                rth->u.dst.flags |= DST_NOPOLICY;
2394
2395        rth->fl.fl4_dst = oldflp->fl4_dst;
2396        rth->fl.fl4_tos = tos;
2397        rth->fl.fl4_src = oldflp->fl4_src;
2398        rth->fl.oif     = oldflp->oif;
2399        rth->fl.mark    = oldflp->mark;
2400        rth->rt_dst     = fl->fl4_dst;
2401        rth->rt_src     = fl->fl4_src;
2402        rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2403        /* get references to the devices that are to be hold by the routing
2404           cache entry */
2405        rth->u.dst.dev  = dev_out;
2406        dev_hold(dev_out);
2407        rth->idev       = in_dev_get(dev_out);
2408        rth->rt_gateway = fl->fl4_dst;
2409        rth->rt_spec_dst= fl->fl4_src;
2410
2411        rth->u.dst.output=ip_output;
2412        rth->rt_genid = rt_genid(dev_net(dev_out));
2413
2414        RT_CACHE_STAT_INC(out_slow_tot);
2415
2416        if (flags & RTCF_LOCAL) {
2417                rth->u.dst.input = ip_local_deliver;
2418                rth->rt_spec_dst = fl->fl4_dst;
2419        }
2420        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2421                rth->rt_spec_dst = fl->fl4_src;
2422                if (flags & RTCF_LOCAL &&
2423                    !(dev_out->flags & IFF_LOOPBACK)) {
2424                        rth->u.dst.output = ip_mc_output;
2425                        RT_CACHE_STAT_INC(out_slow_mc);
2426                }
2427#ifdef CONFIG_IP_MROUTE
2428                if (res->type == RTN_MULTICAST) {
2429                        if (IN_DEV_MFORWARD(in_dev) &&
2430                            !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2431                                rth->u.dst.input = ip_mr_input;
2432                                rth->u.dst.output = ip_mc_output;
2433                        }
2434                }
2435#endif
2436        }
2437
2438        rt_set_nexthop(rth, res, 0);
2439
2440        rth->rt_flags = flags;
2441
2442        *result = rth;
2443 cleanup:
2444        /* release work reference to inet device */
2445        in_dev_put(in_dev);
2446
2447        return err;
2448}
2449
2450static int ip_mkroute_output(struct rtable **rp,
2451                             struct fib_result *res,
2452                             const struct flowi *fl,
2453                             const struct flowi *oldflp,
2454                             struct net_device *dev_out,
2455                             unsigned flags)
2456{
2457        struct rtable *rth = NULL;
2458        int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2459        unsigned hash;
2460        if (err == 0) {
2461                hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2462                               rt_genid(dev_net(dev_out)));
2463                err = rt_intern_hash(hash, rth, rp, NULL);
2464        }
2465
2466        return err;
2467}
2468
2469/*
2470 * Major route resolver routine.
2471 */
2472
2473static int ip_route_output_slow(struct net *net, struct rtable **rp,
2474                                const struct flowi *oldflp)
2475{
2476        u32 tos = RT_FL_TOS(oldflp);
2477        struct flowi fl = { .nl_u = { .ip4_u =
2478                                      { .daddr = oldflp->fl4_dst,
2479                                        .saddr = oldflp->fl4_src,
2480                                        .tos = tos & IPTOS_RT_MASK,
2481                                        .scope = ((tos & RTO_ONLINK) ?
2482                                                  RT_SCOPE_LINK :
2483                                                  RT_SCOPE_UNIVERSE),
2484                                      } },
2485                            .mark = oldflp->mark,
2486                            .iif = net->loopback_dev->ifindex,
2487                            .oif = oldflp->oif };
2488        struct fib_result res;
2489        unsigned flags = 0;
2490        struct net_device *dev_out = NULL;
2491        int free_res = 0;
2492        int err;
2493
2494
2495        res.fi          = NULL;
2496#ifdef CONFIG_IP_MULTIPLE_TABLES
2497        res.r           = NULL;
2498#endif
2499
2500        if (oldflp->fl4_src) {
2501                err = -EINVAL;
2502                if (ipv4_is_multicast(oldflp->fl4_src) ||
2503                    ipv4_is_lbcast(oldflp->fl4_src) ||
2504                    ipv4_is_zeronet(oldflp->fl4_src))
2505                        goto out;
2506
2507                /* I removed check for oif == dev_out->oif here.
2508                   It was wrong for two reasons:
2509                   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2510                      is assigned to multiple interfaces.
2511                   2. Moreover, we are allowed to send packets with saddr
2512                      of another iface. --ANK
2513                 */
2514
2515                if (oldflp->oif == 0
2516                    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2517                        oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2518                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2519                        dev_out = ip_dev_find(net, oldflp->fl4_src);
2520                        if (dev_out == NULL)
2521                                goto out;
2522
2523                        /* Special hack: user can direct multicasts
2524                           and limited broadcast via necessary interface
2525                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2526                           This hack is not just for fun, it allows
2527                           vic,vat and friends to work.
2528                           They bind socket to loopback, set ttl to zero
2529                           and expect that it will work.
2530                           From the viewpoint of routing cache they are broken,
2531                           because we are not allowed to build multicast path
2532                           with loopback source addr (look, routing cache
2533                           cannot know, that ttl is zero, so that packet
2534                           will not leave this host and route is valid).
2535                           Luckily, this hack is good workaround.
2536                         */
2537
2538                        fl.oif = dev_out->ifindex;
2539                        goto make_route;
2540                }
2541
2542                if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2543                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2544                        dev_out = ip_dev_find(net, oldflp->fl4_src);
2545                        if (dev_out == NULL)
2546                                goto out;
2547                        dev_put(dev_out);
2548                        dev_out = NULL;
2549                }
2550        }
2551
2552
2553        if (oldflp->oif) {
2554                dev_out = dev_get_by_index(net, oldflp->oif);
2555                err = -ENODEV;
2556                if (dev_out == NULL)
2557                        goto out;
2558
2559                /* RACE: Check return value of inet_select_addr instead. */
2560                if (__in_dev_get_rtnl(dev_out) == NULL) {
2561                        dev_put(dev_out);
2562                        goto out;       /* Wrong error code */
2563                }
2564
2565                if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2566                    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2567                        if (!fl.fl4_src)
2568                                fl.fl4_src = inet_select_addr(dev_out, 0,
2569                                                              RT_SCOPE_LINK);
2570                        goto make_route;
2571                }
2572                if (!fl.fl4_src) {
2573                        if (ipv4_is_multicast(oldflp->fl4_dst))
2574                                fl.fl4_src = inet_select_addr(dev_out, 0,
2575                                                              fl.fl4_scope);
2576                        else if (!oldflp->fl4_dst)
2577                                fl.fl4_src = inet_select_addr(dev_out, 0,
2578                                                              RT_SCOPE_HOST);
2579                }
2580        }
2581
2582        if (!fl.fl4_dst) {
2583                fl.fl4_dst = fl.fl4_src;
2584                if (!fl.fl4_dst)
2585                        fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2586                if (dev_out)
2587                        dev_put(dev_out);
2588                dev_out = net->loopback_dev;
2589                dev_hold(dev_out);
2590                fl.oif = net->loopback_dev->ifindex;
2591                res.type = RTN_LOCAL;
2592                flags |= RTCF_LOCAL;
2593                goto make_route;
2594        }
2595
2596        if (fib_lookup(net, &fl, &res)) {
2597                res.fi = NULL;
2598                if (oldflp->oif) {
2599                        /* Apparently, routing tables are wrong. Assume,
2600                           that the destination is on link.
2601
2602                           WHY? DW.
2603                           Because we are allowed to send to iface
2604                           even if it has NO routes and NO assigned
2605                           addresses. When oif is specified, routing
2606                           tables are looked up with only one purpose:
2607                           to catch if destination is gatewayed, rather than
2608                           direct. Moreover, if MSG_DONTROUTE is set,
2609                           we send packet, ignoring both routing tables
2610                           and ifaddr state. --ANK
2611
2612
2613                           We could make it even if oif is unknown,
2614                           likely IPv6, but we do not.
2615                         */
2616
2617                        if (fl.fl4_src == 0)
2618                                fl.fl4_src = inet_select_addr(dev_out, 0,
2619                                                              RT_SCOPE_LINK);
2620                        res.type = RTN_UNICAST;
2621                        goto make_route;
2622                }
2623                if (dev_out)
2624                        dev_put(dev_out);
2625                err = -ENETUNREACH;
2626                goto out;
2627        }
2628        free_res = 1;
2629
2630        if (res.type == RTN_LOCAL) {
2631                if (!fl.fl4_src)
2632                        fl.fl4_src = fl.fl4_dst;
2633                if (dev_out)
2634                        dev_put(dev_out);
2635                dev_out = net->loopback_dev;
2636                dev_hold(dev_out);
2637                fl.oif = dev_out->ifindex;
2638                if (res.fi)
2639                        fib_info_put(res.fi);
2640                res.fi = NULL;
2641                flags |= RTCF_LOCAL;
2642                goto make_route;
2643        }
2644
2645#ifdef CONFIG_IP_ROUTE_MULTIPATH
2646        if (res.fi->fib_nhs > 1 && fl.oif == 0)
2647                fib_select_multipath(&fl, &res);
2648        else
2649#endif
2650        if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2651                fib_select_default(net, &fl, &res);
2652
2653        if (!fl.fl4_src)
2654                fl.fl4_src = FIB_RES_PREFSRC(res);
2655
2656        if (dev_out)
2657                dev_put(dev_out);
2658        dev_out = FIB_RES_DEV(res);
2659        dev_hold(dev_out);
2660        fl.oif = dev_out->ifindex;
2661
2662
2663make_route:
2664        err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2665
2666
2667        if (free_res)
2668                fib_res_put(&res);
2669        if (dev_out)
2670                dev_put(dev_out);
2671out:    return err;
2672}
2673
2674int __ip_route_output_key(struct net *net, struct rtable **rp,
2675                          const struct flowi *flp)
2676{
2677        unsigned hash;
2678        struct rtable *rth;
2679
2680        if (!rt_caching(net))
2681                goto slow_output;
2682
2683        hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2684
2685        rcu_read_lock_bh();
2686        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2687                rth = rcu_dereference(rth->u.dst.rt_next)) {
2688                if (rth->fl.fl4_dst == flp->fl4_dst &&
2689                    rth->fl.fl4_src == flp->fl4_src &&
2690                    rth->fl.iif == 0 &&
2691                    rth->fl.oif == flp->oif &&
2692                    rth->fl.mark == flp->mark &&
2693                    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2694                            (IPTOS_RT_MASK | RTO_ONLINK)) &&
2695                    net_eq(dev_net(rth->u.dst.dev), net) &&
2696                    !rt_is_expired(rth)) {
2697                        dst_use(&rth->u.dst, jiffies);
2698                        RT_CACHE_STAT_INC(out_hit);
2699                        rcu_read_unlock_bh();
2700                        *rp = rth;
2701                        return 0;
2702                }
2703                RT_CACHE_STAT_INC(out_hlist_search);
2704        }
2705        rcu_read_unlock_bh();
2706
2707slow_output:
2708        return ip_route_output_slow(net, rp, flp);
2709}
2710
2711EXPORT_SYMBOL_GPL(__ip_route_output_key);
2712
2713static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2714{
2715}
2716
2717static struct dst_ops ipv4_dst_blackhole_ops = {
2718        .family                 =       AF_INET,
2719        .protocol               =       cpu_to_be16(ETH_P_IP),
2720        .destroy                =       ipv4_dst_destroy,
2721        .check                  =       ipv4_dst_check,
2722        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2723        .entries                =       ATOMIC_INIT(0),
2724};
2725
2726
2727static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2728{
2729        struct rtable *ort = *rp;
2730        struct rtable *rt = (struct rtable *)
2731                dst_alloc(&ipv4_dst_blackhole_ops);
2732
2733        if (rt) {
2734                struct dst_entry *new = &rt->u.dst;
2735
2736                atomic_set(&new->__refcnt, 1);
2737                new->__use = 1;
2738                new->input = dst_discard;
2739                new->output = dst_discard;
2740                memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2741
2742                new->dev = ort->u.dst.dev;
2743                if (new->dev)
2744                        dev_hold(new->dev);
2745
2746                rt->fl = ort->fl;
2747
2748                rt->idev = ort->idev;
2749                if (rt->idev)
2750                        in_dev_hold(rt->idev);
2751                rt->rt_genid = rt_genid(net);
2752                rt->rt_flags = ort->rt_flags;
2753                rt->rt_type = ort->rt_type;
2754                rt->rt_dst = ort->rt_dst;
2755                rt->rt_src = ort->rt_src;
2756                rt->rt_iif = ort->rt_iif;
2757                rt->rt_gateway = ort->rt_gateway;
2758                rt->rt_spec_dst = ort->rt_spec_dst;
2759                rt->peer = ort->peer;
2760                if (rt->peer)
2761                        atomic_inc(&rt->peer->refcnt);
2762
2763                dst_free(new);
2764        }
2765
2766        dst_release(&(*rp)->u.dst);
2767        *rp = rt;
2768        return (rt ? 0 : -ENOMEM);
2769}
2770
2771int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2772                         struct sock *sk, int flags)
2773{
2774        int err;
2775
2776        if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2777                return err;
2778
2779        if (flp->proto) {
2780                if (!flp->fl4_src)
2781                        flp->fl4_src = (*rp)->rt_src;
2782                if (!flp->fl4_dst)
2783                        flp->fl4_dst = (*rp)->rt_dst;
2784                err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2785                                    flags ? XFRM_LOOKUP_WAIT : 0);
2786                if (err == -EREMOTE)
2787                        err = ipv4_dst_blackhole(net, rp, flp);
2788
2789                return err;
2790        }
2791
2792        return 0;
2793}
2794
2795EXPORT_SYMBOL_GPL(ip_route_output_flow);
2796
2797int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2798{
2799        return ip_route_output_flow(net, rp, flp, NULL, 0);
2800}
2801
2802static int rt_fill_info(struct net *net,
2803                        struct sk_buff *skb, u32 pid, u32 seq, int event,
2804                        int nowait, unsigned int flags)
2805{
2806        struct rtable *rt = skb_rtable(skb);
2807        struct rtmsg *r;
2808        struct nlmsghdr *nlh;
2809        long expires;
2810        u32 id = 0, ts = 0, tsage = 0, error;
2811
2812        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2813        if (nlh == NULL)
2814                return -EMSGSIZE;
2815
2816        r = nlmsg_data(nlh);
2817        r->rtm_family    = AF_INET;
2818        r->rtm_dst_len  = 32;
2819        r->rtm_src_len  = 0;
2820        r->rtm_tos      = rt->fl.fl4_tos;
2821        r->rtm_table    = RT_TABLE_MAIN;
2822        NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2823        r->rtm_type     = rt->rt_type;
2824        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2825        r->rtm_protocol = RTPROT_UNSPEC;
2826        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2827        if (rt->rt_flags & RTCF_NOTIFY)
2828                r->rtm_flags |= RTM_F_NOTIFY;
2829
2830        NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2831
2832        if (rt->fl.fl4_src) {
2833                r->rtm_src_len = 32;
2834                NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2835        }
2836        if (rt->u.dst.dev)
2837                NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2838#ifdef CONFIG_NET_CLS_ROUTE
2839        if (rt->u.dst.tclassid)
2840                NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2841#endif
2842        if (rt->fl.iif)
2843                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2844        else if (rt->rt_src != rt->fl.fl4_src)
2845                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2846
2847        if (rt->rt_dst != rt->rt_gateway)
2848                NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2849
2850        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2851                goto nla_put_failure;
2852
2853        error = rt->u.dst.error;
2854        expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2855        if (rt->peer) {
2856                id = rt->peer->ip_id_count;
2857                if (rt->peer->tcp_ts_stamp) {
2858                        ts = rt->peer->tcp_ts;
2859                        tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2860                }
2861        }
2862
2863        if (rt->fl.iif) {
2864#ifdef CONFIG_IP_MROUTE
2865                __be32 dst = rt->rt_dst;
2866
2867                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2868                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2869                        int err = ipmr_get_route(net, skb, r, nowait);
2870                        if (err <= 0) {
2871                                if (!nowait) {
2872                                        if (err == 0)
2873                                                return 0;
2874                                        goto nla_put_failure;
2875                                } else {
2876                                        if (err == -EMSGSIZE)
2877                                                goto nla_put_failure;
2878                                        error = err;
2879                                }
2880                        }
2881                } else
2882#endif
2883                        NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2884        }
2885
2886        if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2887                               expires, error) < 0)
2888                goto nla_put_failure;
2889
2890        return nlmsg_end(skb, nlh);
2891
2892nla_put_failure:
2893        nlmsg_cancel(skb, nlh);
2894        return -EMSGSIZE;
2895}
2896
2897static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2898{
2899        struct net *net = sock_net(in_skb->sk);
2900        struct rtmsg *rtm;
2901        struct nlattr *tb[RTA_MAX+1];
2902        struct rtable *rt = NULL;
2903        __be32 dst = 0;
2904        __be32 src = 0;
2905        u32 iif;
2906        int err;
2907        struct sk_buff *skb;
2908
2909        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2910        if (err < 0)
2911                goto errout;
2912
2913        rtm = nlmsg_data(nlh);
2914
2915        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2916        if (skb == NULL) {
2917                err = -ENOBUFS;
2918                goto errout;
2919        }
2920
2921        /* Reserve room for dummy headers, this skb can pass
2922           through good chunk of routing engine.
2923         */
2924        skb_reset_mac_header(skb);
2925        skb_reset_network_header(skb);
2926
2927        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2928        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2929        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2930
2931        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2932        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2933        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2934
2935        if (iif) {
2936                struct net_device *dev;
2937
2938                dev = __dev_get_by_index(net, iif);
2939                if (dev == NULL) {
2940                        err = -ENODEV;
2941                        goto errout_free;
2942                }
2943
2944                skb->protocol   = htons(ETH_P_IP);
2945                skb->dev        = dev;
2946                local_bh_disable();
2947                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2948                local_bh_enable();
2949
2950                rt = skb_rtable(skb);
2951                if (err == 0 && rt->u.dst.error)
2952                        err = -rt->u.dst.error;
2953        } else {
2954                struct flowi fl = {
2955                        .nl_u = {
2956                                .ip4_u = {
2957                                        .daddr = dst,
2958                                        .saddr = src,
2959                                        .tos = rtm->rtm_tos,
2960                                },
2961                        },
2962                        .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2963                };
2964                err = ip_route_output_key(net, &rt, &fl);
2965        }
2966
2967        if (err)
2968                goto errout_free;
2969
2970        skb_dst_set(skb, &rt->u.dst);
2971        if (rtm->rtm_flags & RTM_F_NOTIFY)
2972                rt->rt_flags |= RTCF_NOTIFY;
2973
2974        err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2975                           RTM_NEWROUTE, 0, 0);
2976        if (err <= 0)
2977                goto errout_free;
2978
2979        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2980errout:
2981        return err;
2982
2983errout_free:
2984        kfree_skb(skb);
2985        goto errout;
2986}
2987
2988int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2989{
2990        struct rtable *rt;
2991        int h, s_h;
2992        int idx, s_idx;
2993        struct net *net;
2994
2995        net = sock_net(skb->sk);
2996
2997        s_h = cb->args[0];
2998        if (s_h < 0)
2999                s_h = 0;
3000        s_idx = idx = cb->args[1];
3001        for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3002                if (!rt_hash_table[h].chain)
3003                        continue;
3004                rcu_read_lock_bh();
3005                for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3006                     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3007                        if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3008                                continue;
3009                        if (rt_is_expired(rt))
3010                                continue;
3011                        skb_dst_set(skb, dst_clone(&rt->u.dst));
3012                        if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3013                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3014                                         1, NLM_F_MULTI) <= 0) {
3015                                skb_dst_drop(skb);
3016                                rcu_read_unlock_bh();
3017                                goto done;
3018                        }
3019                        skb_dst_drop(skb);
3020                }
3021                rcu_read_unlock_bh();
3022        }
3023
3024done:
3025        cb->args[0] = h;
3026        cb->args[1] = idx;
3027        return skb->len;
3028}
3029
3030void ip_rt_multicast_event(struct in_device *in_dev)
3031{
3032        rt_cache_flush(dev_net(in_dev->dev), 0);
3033}
3034
3035#ifdef CONFIG_SYSCTL
3036static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3037                                        struct file *filp, void __user *buffer,
3038                                        size_t *lenp, loff_t *ppos)
3039{
3040        if (write) {
3041                int flush_delay;
3042                ctl_table ctl;
3043                struct net *net;
3044
3045                memcpy(&ctl, __ctl, sizeof(ctl));
3046                ctl.data = &flush_delay;
3047                proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
3048
3049                net = (struct net *)__ctl->extra1;
3050                rt_cache_flush(net, flush_delay);
3051                return 0;
3052        }
3053
3054        return -EINVAL;
3055}
3056
3057static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3058                                                void __user *oldval,
3059                                                size_t __user *oldlenp,
3060                                                void __user *newval,
3061                                                size_t newlen)
3062{
3063        int delay;
3064        struct net *net;
3065        if (newlen != sizeof(int))
3066                return -EINVAL;
3067        if (get_user(delay, (int __user *)newval))
3068                return -EFAULT;
3069        net = (struct net *)table->extra1;
3070        rt_cache_flush(net, delay);
3071        return 0;
3072}
3073
3074static void rt_secret_reschedule(int old)
3075{
3076        struct net *net;
3077        int new = ip_rt_secret_interval;
3078        int diff = new - old;
3079
3080        if (!diff)
3081                return;
3082
3083        rtnl_lock();
3084        for_each_net(net) {
3085                int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3086
3087                if (!new)
3088                        continue;
3089
3090                if (deleted) {
3091                        long time = net->ipv4.rt_secret_timer.expires - jiffies;
3092
3093                        if (time <= 0 || (time += diff) <= 0)
3094                                time = 0;
3095
3096                        net->ipv4.rt_secret_timer.expires = time;
3097                } else
3098                        net->ipv4.rt_secret_timer.expires = new;
3099
3100                net->ipv4.rt_secret_timer.expires += jiffies;
3101                add_timer(&net->ipv4.rt_secret_timer);
3102        }
3103        rtnl_unlock();
3104}
3105
3106static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3107                                          struct file *filp,
3108                                          void __user *buffer, size_t *lenp,
3109                                          loff_t *ppos)
3110{
3111        int old = ip_rt_secret_interval;
3112        int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3113
3114        rt_secret_reschedule(old);
3115
3116        return ret;
3117}
3118
3119static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3120                                                   void __user *oldval,
3121                                                   size_t __user *oldlenp,
3122                                                   void __user *newval,
3123                                                   size_t newlen)
3124{
3125        int old = ip_rt_secret_interval;
3126        int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3127
3128        rt_secret_reschedule(old);
3129
3130        return ret;
3131}
3132
3133static ctl_table ipv4_route_table[] = {
3134        {
3135                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
3136                .procname       = "gc_thresh",
3137                .data           = &ipv4_dst_ops.gc_thresh,
3138                .maxlen         = sizeof(int),
3139                .mode           = 0644,
3140                .proc_handler   = proc_dointvec,
3141        },
3142        {
3143                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
3144                .procname       = "max_size",
3145                .data           = &ip_rt_max_size,
3146                .maxlen         = sizeof(int),
3147                .mode           = 0644,
3148                .proc_handler   = proc_dointvec,
3149        },
3150        {
3151                /*  Deprecated. Use gc_min_interval_ms */
3152
3153                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3154                .procname       = "gc_min_interval",
3155                .data           = &ip_rt_gc_min_interval,
3156                .maxlen         = sizeof(int),
3157                .mode           = 0644,
3158                .proc_handler   = proc_dointvec_jiffies,
3159                .strategy       = sysctl_jiffies,
3160        },
3161        {
3162                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3163                .procname       = "gc_min_interval_ms",
3164                .data           = &ip_rt_gc_min_interval,
3165                .maxlen         = sizeof(int),
3166                .mode           = 0644,
3167                .proc_handler   = proc_dointvec_ms_jiffies,
3168                .strategy       = sysctl_ms_jiffies,
3169        },
3170        {
3171                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
3172                .procname       = "gc_timeout",
3173                .data           = &ip_rt_gc_timeout,
3174                .maxlen         = sizeof(int),
3175                .mode           = 0644,
3176                .proc_handler   = proc_dointvec_jiffies,
3177                .strategy       = sysctl_jiffies,
3178        },
3179        {
3180                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
3181                .procname       = "gc_interval",
3182                .data           = &ip_rt_gc_interval,
3183                .maxlen         = sizeof(int),
3184                .mode           = 0644,
3185                .proc_handler   = proc_dointvec_jiffies,
3186                .strategy       = sysctl_jiffies,
3187        },
3188        {
3189                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
3190                .procname       = "redirect_load",
3191                .data           = &ip_rt_redirect_load,
3192                .maxlen         = sizeof(int),
3193                .mode           = 0644,
3194                .proc_handler   = proc_dointvec,
3195        },
3196        {
3197                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3198                .procname       = "redirect_number",
3199                .data           = &ip_rt_redirect_number,
3200                .maxlen         = sizeof(int),
3201                .mode           = 0644,
3202                .proc_handler   = proc_dointvec,
3203        },
3204        {
3205                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3206                .procname       = "redirect_silence",
3207                .data           = &ip_rt_redirect_silence,
3208                .maxlen         = sizeof(int),
3209                .mode           = 0644,
3210                .proc_handler   = proc_dointvec,
3211        },
3212        {
3213                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3214                .procname       = "error_cost",
3215                .data           = &ip_rt_error_cost,
3216                .maxlen         = sizeof(int),
3217                .mode           = 0644,
3218                .proc_handler   = proc_dointvec,
3219        },
3220        {
3221                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3222                .procname       = "error_burst",
3223                .data           = &ip_rt_error_burst,
3224                .maxlen         = sizeof(int),
3225                .mode           = 0644,
3226                .proc_handler   = proc_dointvec,
3227        },
3228        {
3229                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3230                .procname       = "gc_elasticity",
3231                .data           = &ip_rt_gc_elasticity,
3232                .maxlen         = sizeof(int),
3233                .mode           = 0644,
3234                .proc_handler   = proc_dointvec,
3235        },
3236        {
3237                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3238                .procname       = "mtu_expires",
3239                .data           = &ip_rt_mtu_expires,
3240                .maxlen         = sizeof(int),
3241                .mode           = 0644,
3242                .proc_handler   = proc_dointvec_jiffies,
3243                .strategy       = sysctl_jiffies,
3244        },
3245        {
3246                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3247                .procname       = "min_pmtu",
3248                .data           = &ip_rt_min_pmtu,
3249                .maxlen         = sizeof(int),
3250                .mode           = 0644,
3251                .proc_handler   = proc_dointvec,
3252        },
3253        {
3254                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3255                .procname       = "min_adv_mss",
3256                .data           = &ip_rt_min_advmss,
3257                .maxlen         = sizeof(int),
3258                .mode           = 0644,
3259                .proc_handler   = proc_dointvec,
3260        },
3261        {
3262                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3263                .procname       = "secret_interval",
3264                .data           = &ip_rt_secret_interval,
3265                .maxlen         = sizeof(int),
3266                .mode           = 0644,
3267                .proc_handler   = ipv4_sysctl_rt_secret_interval,
3268                .strategy       = ipv4_sysctl_rt_secret_interval_strategy,
3269        },
3270        { .ctl_name = 0 }
3271};
3272
3273static struct ctl_table empty[1];
3274
3275static struct ctl_table ipv4_skeleton[] =
3276{
3277        { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3278          .mode = 0555, .child = ipv4_route_table},
3279        { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3280          .mode = 0555, .child = empty},
3281        { }
3282};
3283
3284static __net_initdata struct ctl_path ipv4_path[] = {
3285        { .procname = "net", .ctl_name = CTL_NET, },
3286        { .procname = "ipv4", .ctl_name = NET_IPV4, },
3287        { },
3288};
3289
3290static struct ctl_table ipv4_route_flush_table[] = {
3291        {
3292                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3293                .procname       = "flush",
3294                .maxlen         = sizeof(int),
3295                .mode           = 0200,
3296                .proc_handler   = ipv4_sysctl_rtcache_flush,
3297                .strategy       = ipv4_sysctl_rtcache_flush_strategy,
3298        },
3299        { .ctl_name = 0 },
3300};
3301
3302static __net_initdata struct ctl_path ipv4_route_path[] = {
3303        { .procname = "net", .ctl_name = CTL_NET, },
3304        { .procname = "ipv4", .ctl_name = NET_IPV4, },
3305        { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3306        { },
3307};
3308
3309static __net_init int sysctl_route_net_init(struct net *net)
3310{
3311        struct ctl_table *tbl;
3312
3313        tbl = ipv4_route_flush_table;
3314        if (net != &init_net) {
3315                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3316                if (tbl == NULL)
3317                        goto err_dup;
3318        }
3319        tbl[0].extra1 = net;
3320
3321        net->ipv4.route_hdr =
3322                register_net_sysctl_table(net, ipv4_route_path, tbl);
3323        if (net->ipv4.route_hdr == NULL)
3324                goto err_reg;
3325        return 0;
3326
3327err_reg:
3328        if (tbl != ipv4_route_flush_table)
3329                kfree(tbl);
3330err_dup:
3331        return -ENOMEM;
3332}
3333
3334static __net_exit void sysctl_route_net_exit(struct net *net)
3335{
3336        struct ctl_table *tbl;
3337
3338        tbl = net->ipv4.route_hdr->ctl_table_arg;
3339        unregister_net_sysctl_table(net->ipv4.route_hdr);
3340        BUG_ON(tbl == ipv4_route_flush_table);
3341        kfree(tbl);
3342}
3343
3344static __net_initdata struct pernet_operations sysctl_route_ops = {
3345        .init = sysctl_route_net_init,
3346        .exit = sysctl_route_net_exit,
3347};
3348#endif
3349
3350
3351static __net_init int rt_secret_timer_init(struct net *net)
3352{
3353        atomic_set(&net->ipv4.rt_genid,
3354                        (int) ((num_physpages ^ (num_physpages>>8)) ^
3355                        (jiffies ^ (jiffies >> 7))));
3356
3357        net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3358        net->ipv4.rt_secret_timer.data = (unsigned long)net;
3359        init_timer_deferrable(&net->ipv4.rt_secret_timer);
3360
3361        if (ip_rt_secret_interval) {
3362                net->ipv4.rt_secret_timer.expires =
3363                        jiffies + net_random() % ip_rt_secret_interval +
3364                        ip_rt_secret_interval;
3365                add_timer(&net->ipv4.rt_secret_timer);
3366        }
3367        return 0;
3368}
3369
3370static __net_exit void rt_secret_timer_exit(struct net *net)
3371{
3372        del_timer_sync(&net->ipv4.rt_secret_timer);
3373}
3374
3375static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3376        .init = rt_secret_timer_init,
3377        .exit = rt_secret_timer_exit,
3378};
3379
3380
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Per-cpu accounting area, allocated in ip_rt_init() with room for 256
 * struct ip_rt_acct entries.
 */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3384
3385static __initdata unsigned long rhash_entries;
3386static int __init set_rhash_entries(char *str)
3387{
3388        if (!str)
3389                return 0;
3390        rhash_entries = simple_strtoul(str, &str, 0);
3391        return 1;
3392}
3393__setup("rhash_entries=", set_rhash_entries);
3394
3395int __init ip_rt_init(void)
3396{
3397        int rc = 0;
3398
3399#ifdef CONFIG_NET_CLS_ROUTE
3400        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3401        if (!ip_rt_acct)
3402                panic("IP: failed to allocate ip_rt_acct\n");
3403#endif
3404
3405        ipv4_dst_ops.kmem_cachep =
3406                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3407                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3408
3409        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3410
3411        rt_hash_table = (struct rt_hash_bucket *)
3412                alloc_large_system_hash("IP route cache",
3413                                        sizeof(struct rt_hash_bucket),
3414                                        rhash_entries,
3415                                        (num_physpages >= 128 * 1024) ?
3416                                        15 : 17,
3417                                        0,
3418                                        &rt_hash_log,
3419                                        &rt_hash_mask,
3420                                        rhash_entries ? 0 : 512 * 1024);
3421        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3422        rt_hash_lock_init();
3423
3424        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3425        ip_rt_max_size = (rt_hash_mask + 1) * 16;
3426
3427        devinet_init();
3428        ip_fib_init();
3429
3430        /* All the timers, started at system startup tend
3431           to synchronize. Perturb it a bit.
3432         */
3433        INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3434        expires_ljiffies = jiffies;
3435        schedule_delayed_work(&expires_work,
3436                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3437
3438        if (register_pernet_subsys(&rt_secret_timer_ops))
3439                printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3440
3441        if (ip_rt_proc_init())
3442                printk(KERN_ERR "Unable to create route proc files\n");
3443#ifdef CONFIG_XFRM
3444        xfrm_init();
3445        xfrm4_init();
3446#endif
3447        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3448
3449#ifdef CONFIG_SYSCTL
3450        register_pernet_subsys(&sysctl_route_ops);
3451#endif
3452        return rc;
3453}
3454
#ifdef CONFIG_SYSCTL
/*
 * Register the static part of the net.ipv4 sysctl tree (ipv4_skeleton)
 * under ipv4_path.
 *
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 *
 * NOTE(review): the value returned by register_sysctl_paths() is
 * discarded here, so a failed registration goes unnoticed.
 */
void __init ip_static_sysctl_init(void)
{
        register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif /* CONFIG_SYSCTL */
3465
/* Symbols from this file that are exported via EXPORT_SYMBOL. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);
3469
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.