linux/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       Verify area fixes.
  16 *              Alan Cox        :       cli() protects routing changes
  17 *              Rui Oliveira    :       ICMP routing table updates
  18 *              (rco@di.uminho.pt)      Routing table insertion and update
  19 *              Linus Torvalds  :       Rewrote bits to be sensible
  20 *              Alan Cox        :       Added BSD route gw semantics
  21 *              Alan Cox        :       Super /proc >4K
  22 *              Alan Cox        :       MTU in route table
  23 *              Alan Cox        :       MSS actually. Also added the window
  24 *                                      clamper.
  25 *              Sam Lantinga    :       Fixed route matching in rt_del()
  26 *              Alan Cox        :       Routing cache support.
  27 *              Alan Cox        :       Removed compatibility cruft.
  28 *              Alan Cox        :       RTF_REJECT support.
  29 *              Alan Cox        :       TCP irtt support.
  30 *              Jonathan Naylor :       Added Metric support.
  31 *      Miquel van Smoorenburg  :       BSD API fixes.
  32 *      Miquel van Smoorenburg  :       Metrics.
  33 *              Alan Cox        :       Use __u32 properly
  34 *              Alan Cox        :       Aligned routing errors more closely with BSD
  35 *                                      our system is still very different.
  36 *              Alan Cox        :       Faster /proc handling
  37 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38 *                                      routing caches and better behaviour.
  39 *
  40 *              Olaf Erb        :       irtt wasn't being copied right.
  41 *              Bjorn Ekwall    :       Kerneld route support.
  42 *              Alan Cox        :       Multicast fixed (I hope)
  43 *              Pavel Krauz     :       Limited broadcast fixed
  44 *              Mike McLagan    :       Routing by source
  45 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46 *                                      route.c and rewritten from scratch.
  47 *              Andi Kleen      :       Load-limit warning messages.
  48 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52 *              Marc Boucher    :       routing by fwmark
  53 *      Robert Olsson           :       Added rt_cache statistics
  54 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58 *
  59 *              This program is free software; you can redistribute it and/or
  60 *              modify it under the terms of the GNU General Public License
  61 *              as published by the Free Software Foundation; either version
  62 *              2 of the License, or (at your option) any later version.
  63 */
  64
  65#include <linux/module.h>
  66#include <asm/uaccess.h>
  67#include <asm/system.h>
  68#include <linux/bitops.h>
  69#include <linux/types.h>
  70#include <linux/kernel.h>
  71#include <linux/mm.h>
  72#include <linux/bootmem.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/workqueue.h>
  83#include <linux/skbuff.h>
  84#include <linux/inetdevice.h>
  85#include <linux/igmp.h>
  86#include <linux/pkt_sched.h>
  87#include <linux/mroute.h>
  88#include <linux/netfilter_ipv4.h>
  89#include <linux/random.h>
  90#include <linux/jhash.h>
  91#include <linux/rcupdate.h>
  92#include <linux/times.h>
  93#include <linux/slab.h>
  94#include <linux/prefetch.h>
  95#include <net/dst.h>
  96#include <net/net_namespace.h>
  97#include <net/protocol.h>
  98#include <net/ip.h>
  99#include <net/route.h>
 100#include <net/inetpeer.h>
 101#include <net/sock.h>
 102#include <net/ip_fib.h>
 103#include <net/arp.h>
 104#include <net/tcp.h>
 105#include <net/icmp.h>
 106#include <net/xfrm.h>
 107#include <net/netevent.h>
 108#include <net/rtnetlink.h>
 109#ifdef CONFIG_SYSCTL
 110#include <linux/sysctl.h>
 111#endif
 112#include <net/secure_seq.h>
 113
 114#define RT_FL_TOS(oldflp4) \
 115        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 116
 117#define IP_MAX_MTU      0xFFF0
 118
 119#define RT_GC_TIMEOUT (300*HZ)
 120
 121static int ip_rt_max_size;
 122static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 123static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 124static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 125static int ip_rt_redirect_number __read_mostly  = 9;
 126static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128static int ip_rt_error_cost __read_mostly       = HZ;
 129static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130static int ip_rt_gc_elasticity __read_mostly    = 8;
 131static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133static int ip_rt_min_advmss __read_mostly       = 256;
 134static int rt_chain_length_max __read_mostly    = 20;
 135
 136static struct delayed_work expires_work;
 137static unsigned long expires_ljiffies;
 138
 139/*
 140 *      Interface to generic destination cache.
 141 */
 142
 143static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 144static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 145static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 146static void              ipv4_dst_destroy(struct dst_entry *dst);
 147static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 148static void              ipv4_link_failure(struct sk_buff *skb);
 149static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 150static int rt_garbage_collect(struct dst_ops *ops);
 151
 152static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 153                            int how)
 154{
 155}
 156
 157static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 158{
 159        struct rtable *rt = (struct rtable *) dst;
 160        struct inet_peer *peer;
 161        u32 *p = NULL;
 162
 163        if (!rt->peer)
 164                rt_bind_peer(rt, rt->rt_dst, 1);
 165
 166        peer = rt->peer;
 167        if (peer) {
 168                u32 *old_p = __DST_METRICS_PTR(old);
 169                unsigned long prev, new;
 170
 171                p = peer->metrics;
 172                if (inet_metrics_new(peer))
 173                        memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 174
 175                new = (unsigned long) p;
 176                prev = cmpxchg(&dst->_metrics, old, new);
 177
 178                if (prev != old) {
 179                        p = __DST_METRICS_PTR(prev);
 180                        if (prev & DST_METRICS_READ_ONLY)
 181                                p = NULL;
 182                } else {
 183                        if (rt->fi) {
 184                                fib_info_put(rt->fi);
 185                                rt->fi = NULL;
 186                        }
 187                }
 188        }
 189        return p;
 190}
 191
 192static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 193
 194static struct dst_ops ipv4_dst_ops = {
 195        .family =               AF_INET,
 196        .protocol =             cpu_to_be16(ETH_P_IP),
 197        .gc =                   rt_garbage_collect,
 198        .check =                ipv4_dst_check,
 199        .default_advmss =       ipv4_default_advmss,
 200        .mtu =                  ipv4_mtu,
 201        .cow_metrics =          ipv4_cow_metrics,
 202        .destroy =              ipv4_dst_destroy,
 203        .ifdown =               ipv4_dst_ifdown,
 204        .negative_advice =      ipv4_negative_advice,
 205        .link_failure =         ipv4_link_failure,
 206        .update_pmtu =          ip_rt_update_pmtu,
 207        .local_out =            __ip_local_out,
 208        .neigh_lookup =         ipv4_neigh_lookup,
 209};
 210
 211#define ECN_OR_COST(class)      TC_PRIO_##class
 212
 213const __u8 ip_tos2prio[16] = {
 214        TC_PRIO_BESTEFFORT,
 215        ECN_OR_COST(BESTEFFORT),
 216        TC_PRIO_BESTEFFORT,
 217        ECN_OR_COST(BESTEFFORT),
 218        TC_PRIO_BULK,
 219        ECN_OR_COST(BULK),
 220        TC_PRIO_BULK,
 221        ECN_OR_COST(BULK),
 222        TC_PRIO_INTERACTIVE,
 223        ECN_OR_COST(INTERACTIVE),
 224        TC_PRIO_INTERACTIVE,
 225        ECN_OR_COST(INTERACTIVE),
 226        TC_PRIO_INTERACTIVE_BULK,
 227        ECN_OR_COST(INTERACTIVE_BULK),
 228        TC_PRIO_INTERACTIVE_BULK,
 229        ECN_OR_COST(INTERACTIVE_BULK)
 230};
 231
 232
 233/*
 234 * Route cache.
 235 */
 236
 237/* The locking scheme is rather straight forward:
 238 *
 239 * 1) Read-Copy Update protects the buckets of the central route hash.
 240 * 2) Only writers remove entries, and they hold the lock
 241 *    as they look at rtable reference counts.
 242 * 3) Only readers acquire references to rtable entries,
 243 *    they do so with atomic increments and with the
 244 *    lock held.
 245 */
 246
 247struct rt_hash_bucket {
 248        struct rtable __rcu     *chain;
 249};
 250
 251#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 252        defined(CONFIG_PROVE_LOCKING)
 253/*
 254 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 255 * The size of this table is a power of two and depends on the number of CPUS.
 256 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 257 */
 258#ifdef CONFIG_LOCKDEP
 259# define RT_HASH_LOCK_SZ        256
 260#else
 261# if NR_CPUS >= 32
 262#  define RT_HASH_LOCK_SZ       4096
 263# elif NR_CPUS >= 16
 264#  define RT_HASH_LOCK_SZ       2048
 265# elif NR_CPUS >= 8
 266#  define RT_HASH_LOCK_SZ       1024
 267# elif NR_CPUS >= 4
 268#  define RT_HASH_LOCK_SZ       512
 269# else
 270#  define RT_HASH_LOCK_SZ       256
 271# endif
 272#endif
 273
 274static spinlock_t       *rt_hash_locks;
 275# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 276
 277static __init void rt_hash_lock_init(void)
 278{
 279        int i;
 280
 281        rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 282                        GFP_KERNEL);
 283        if (!rt_hash_locks)
 284                panic("IP: failed to allocate rt_hash_locks\n");
 285
 286        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 287                spin_lock_init(&rt_hash_locks[i]);
 288}
 289#else
 290# define rt_hash_lock_addr(slot) NULL
 291
 292static inline void rt_hash_lock_init(void)
 293{
 294}
 295#endif
 296
 297static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 298static unsigned                 rt_hash_mask __read_mostly;
 299static unsigned int             rt_hash_log  __read_mostly;
 300
 301static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 302#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 303
 304static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 305                                   int genid)
 306{
 307        return jhash_3words((__force u32)daddr, (__force u32)saddr,
 308                            idx, genid)
 309                & rt_hash_mask;
 310}
 311
 312static inline int rt_genid(struct net *net)
 313{
 314        return atomic_read(&net->ipv4.rt_genid);
 315}
 316
 317#ifdef CONFIG_PROC_FS
 318struct rt_cache_iter_state {
 319        struct seq_net_private p;
 320        int bucket;
 321        int genid;
 322};
 323
 324static struct rtable *rt_cache_get_first(struct seq_file *seq)
 325{
 326        struct rt_cache_iter_state *st = seq->private;
 327        struct rtable *r = NULL;
 328
 329        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 330                if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 331                        continue;
 332                rcu_read_lock_bh();
 333                r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 334                while (r) {
 335                        if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 336                            r->rt_genid == st->genid)
 337                                return r;
 338                        r = rcu_dereference_bh(r->dst.rt_next);
 339                }
 340                rcu_read_unlock_bh();
 341        }
 342        return r;
 343}
 344
 345static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 346                                          struct rtable *r)
 347{
 348        struct rt_cache_iter_state *st = seq->private;
 349
 350        r = rcu_dereference_bh(r->dst.rt_next);
 351        while (!r) {
 352                rcu_read_unlock_bh();
 353                do {
 354                        if (--st->bucket < 0)
 355                                return NULL;
 356                } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 357                rcu_read_lock_bh();
 358                r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 359        }
 360        return r;
 361}
 362
 363static struct rtable *rt_cache_get_next(struct seq_file *seq,
 364                                        struct rtable *r)
 365{
 366        struct rt_cache_iter_state *st = seq->private;
 367        while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 368                if (dev_net(r->dst.dev) != seq_file_net(seq))
 369                        continue;
 370                if (r->rt_genid == st->genid)
 371                        break;
 372        }
 373        return r;
 374}
 375
 376static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 377{
 378        struct rtable *r = rt_cache_get_first(seq);
 379
 380        if (r)
 381                while (pos && (r = rt_cache_get_next(seq, r)))
 382                        --pos;
 383        return pos ? NULL : r;
 384}
 385
 386static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 387{
 388        struct rt_cache_iter_state *st = seq->private;
 389        if (*pos)
 390                return rt_cache_get_idx(seq, *pos - 1);
 391        st->genid = rt_genid(seq_file_net(seq));
 392        return SEQ_START_TOKEN;
 393}
 394
 395static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 396{
 397        struct rtable *r;
 398
 399        if (v == SEQ_START_TOKEN)
 400                r = rt_cache_get_first(seq);
 401        else
 402                r = rt_cache_get_next(seq, v);
 403        ++*pos;
 404        return r;
 405}
 406
 407static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 408{
 409        if (v && v != SEQ_START_TOKEN)
 410                rcu_read_unlock_bh();
 411}
 412
 413static int rt_cache_seq_show(struct seq_file *seq, void *v)
 414{
 415        if (v == SEQ_START_TOKEN)
 416                seq_printf(seq, "%-127s\n",
 417                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 418                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 419                           "HHUptod\tSpecDst");
 420        else {
 421                struct rtable *r = v;
 422                struct neighbour *n;
 423                int len, HHUptod;
 424
 425                rcu_read_lock();
 426                n = dst_get_neighbour_noref(&r->dst);
 427                HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 428                rcu_read_unlock();
 429
 430                seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 431                              "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 432                        r->dst.dev ? r->dst.dev->name : "*",
 433                        (__force u32)r->rt_dst,
 434                        (__force u32)r->rt_gateway,
 435                        r->rt_flags, atomic_read(&r->dst.__refcnt),
 436                        r->dst.__use, 0, (__force u32)r->rt_src,
 437                        dst_metric_advmss(&r->dst) + 40,
 438                        dst_metric(&r->dst, RTAX_WINDOW),
 439                        (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 440                              dst_metric(&r->dst, RTAX_RTTVAR)),
 441                        r->rt_key_tos,
 442                        -1,
 443                        HHUptod,
 444                        r->rt_spec_dst, &len);
 445
 446                seq_printf(seq, "%*s\n", 127 - len, "");
 447        }
 448        return 0;
 449}
 450
 451static const struct seq_operations rt_cache_seq_ops = {
 452        .start  = rt_cache_seq_start,
 453        .next   = rt_cache_seq_next,
 454        .stop   = rt_cache_seq_stop,
 455        .show   = rt_cache_seq_show,
 456};
 457
 458static int rt_cache_seq_open(struct inode *inode, struct file *file)
 459{
 460        return seq_open_net(inode, file, &rt_cache_seq_ops,
 461                        sizeof(struct rt_cache_iter_state));
 462}
 463
 464static const struct file_operations rt_cache_seq_fops = {
 465        .owner   = THIS_MODULE,
 466        .open    = rt_cache_seq_open,
 467        .read    = seq_read,
 468        .llseek  = seq_lseek,
 469        .release = seq_release_net,
 470};
 471
 472
 473static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 474{
 475        int cpu;
 476
 477        if (*pos == 0)
 478                return SEQ_START_TOKEN;
 479
 480        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 481                if (!cpu_possible(cpu))
 482                        continue;
 483                *pos = cpu+1;
 484                return &per_cpu(rt_cache_stat, cpu);
 485        }
 486        return NULL;
 487}
 488
 489static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 490{
 491        int cpu;
 492
 493        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 494                if (!cpu_possible(cpu))
 495                        continue;
 496                *pos = cpu+1;
 497                return &per_cpu(rt_cache_stat, cpu);
 498        }
 499        return NULL;
 500
 501}
 502
 503static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 504{
 505
 506}
 507
 508static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 509{
 510        struct rt_cache_stat *st = v;
 511
 512        if (v == SEQ_START_TOKEN) {
 513                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 514                return 0;
 515        }
 516
 517        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 518                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 519                   dst_entries_get_slow(&ipv4_dst_ops),
 520                   st->in_hit,
 521                   st->in_slow_tot,
 522                   st->in_slow_mc,
 523                   st->in_no_route,
 524                   st->in_brd,
 525                   st->in_martian_dst,
 526                   st->in_martian_src,
 527
 528                   st->out_hit,
 529                   st->out_slow_tot,
 530                   st->out_slow_mc,
 531
 532                   st->gc_total,
 533                   st->gc_ignored,
 534                   st->gc_goal_miss,
 535                   st->gc_dst_overflow,
 536                   st->in_hlist_search,
 537                   st->out_hlist_search
 538                );
 539        return 0;
 540}
 541
 542static const struct seq_operations rt_cpu_seq_ops = {
 543        .start  = rt_cpu_seq_start,
 544        .next   = rt_cpu_seq_next,
 545        .stop   = rt_cpu_seq_stop,
 546        .show   = rt_cpu_seq_show,
 547};
 548
 549
 550static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 551{
 552        return seq_open(file, &rt_cpu_seq_ops);
 553}
 554
 555static const struct file_operations rt_cpu_seq_fops = {
 556        .owner   = THIS_MODULE,
 557        .open    = rt_cpu_seq_open,
 558        .read    = seq_read,
 559        .llseek  = seq_lseek,
 560        .release = seq_release,
 561};
 562
 563#ifdef CONFIG_IP_ROUTE_CLASSID
 564static int rt_acct_proc_show(struct seq_file *m, void *v)
 565{
 566        struct ip_rt_acct *dst, *src;
 567        unsigned int i, j;
 568
 569        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 570        if (!dst)
 571                return -ENOMEM;
 572
 573        for_each_possible_cpu(i) {
 574                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 575                for (j = 0; j < 256; j++) {
 576                        dst[j].o_bytes   += src[j].o_bytes;
 577                        dst[j].o_packets += src[j].o_packets;
 578                        dst[j].i_bytes   += src[j].i_bytes;
 579                        dst[j].i_packets += src[j].i_packets;
 580                }
 581        }
 582
 583        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 584        kfree(dst);
 585        return 0;
 586}
 587
 588static int rt_acct_proc_open(struct inode *inode, struct file *file)
 589{
 590        return single_open(file, rt_acct_proc_show, NULL);
 591}
 592
 593static const struct file_operations rt_acct_proc_fops = {
 594        .owner          = THIS_MODULE,
 595        .open           = rt_acct_proc_open,
 596        .read           = seq_read,
 597        .llseek         = seq_lseek,
 598        .release        = single_release,
 599};
 600#endif
 601
 602static int __net_init ip_rt_do_proc_init(struct net *net)
 603{
 604        struct proc_dir_entry *pde;
 605
 606        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 607                        &rt_cache_seq_fops);
 608        if (!pde)
 609                goto err1;
 610
 611        pde = proc_create("rt_cache", S_IRUGO,
 612                          net->proc_net_stat, &rt_cpu_seq_fops);
 613        if (!pde)
 614                goto err2;
 615
 616#ifdef CONFIG_IP_ROUTE_CLASSID
 617        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 618        if (!pde)
 619                goto err3;
 620#endif
 621        return 0;
 622
 623#ifdef CONFIG_IP_ROUTE_CLASSID
 624err3:
 625        remove_proc_entry("rt_cache", net->proc_net_stat);
 626#endif
 627err2:
 628        remove_proc_entry("rt_cache", net->proc_net);
 629err1:
 630        return -ENOMEM;
 631}
 632
 633static void __net_exit ip_rt_do_proc_exit(struct net *net)
 634{
 635        remove_proc_entry("rt_cache", net->proc_net_stat);
 636        remove_proc_entry("rt_cache", net->proc_net);
 637#ifdef CONFIG_IP_ROUTE_CLASSID
 638        remove_proc_entry("rt_acct", net->proc_net);
 639#endif
 640}
 641
 642static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 643        .init = ip_rt_do_proc_init,
 644        .exit = ip_rt_do_proc_exit,
 645};
 646
 647static int __init ip_rt_proc_init(void)
 648{
 649        return register_pernet_subsys(&ip_rt_proc_ops);
 650}
 651
 652#else
 653static inline int ip_rt_proc_init(void)
 654{
 655        return 0;
 656}
 657#endif /* CONFIG_PROC_FS */
 658
 659static inline void rt_free(struct rtable *rt)
 660{
 661        call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 662}
 663
 664static inline void rt_drop(struct rtable *rt)
 665{
 666        ip_rt_put(rt);
 667        call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 668}
 669
 670static inline int rt_fast_clean(struct rtable *rth)
 671{
 672        /* Kill broadcast/multicast entries very aggresively, if they
 673           collide in hash table with more useful entries */
 674        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 675                rt_is_input_route(rth) && rth->dst.rt_next;
 676}
 677
 678static inline int rt_valuable(struct rtable *rth)
 679{
 680        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 681                (rth->peer && rth->peer->pmtu_expires);
 682}
 683
 684static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 685{
 686        unsigned long age;
 687        int ret = 0;
 688
 689        if (atomic_read(&rth->dst.__refcnt))
 690                goto out;
 691
 692        age = jiffies - rth->dst.lastuse;
 693        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 694            (age <= tmo2 && rt_valuable(rth)))
 695                goto out;
 696        ret = 1;
 697out:    return ret;
 698}
 699
 700/* Bits of score are:
 701 * 31: very valuable
 702 * 30: not quite useless
 703 * 29..0: usage counter
 704 */
 705static inline u32 rt_score(struct rtable *rt)
 706{
 707        u32 score = jiffies - rt->dst.lastuse;
 708
 709        score = ~score & ~(3<<30);
 710
 711        if (rt_valuable(rt))
 712                score |= (1<<31);
 713
 714        if (rt_is_output_route(rt) ||
 715            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 716                score |= (1<<30);
 717
 718        return score;
 719}
 720
 721static inline bool rt_caching(const struct net *net)
 722{
 723        return net->ipv4.current_rt_cache_rebuild_count <=
 724                net->ipv4.sysctl_rt_cache_rebuild_count;
 725}
 726
 727static inline bool compare_hash_inputs(const struct rtable *rt1,
 728                                       const struct rtable *rt2)
 729{
 730        return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 731                ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 732                (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 733}
 734
 735static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 736{
 737        return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 738                ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 739                (rt1->rt_mark ^ rt2->rt_mark) |
 740                (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 741                (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 742                (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 743}
 744
 745static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 746{
 747        return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 748}
 749
 750static inline int rt_is_expired(struct rtable *rth)
 751{
 752        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 753}
 754
 755/*
 756 * Perform a full scan of hash table and free all entries.
 757 * Can be called by a softirq or a process.
 758 * In the later case, we want to be reschedule if necessary
 759 */
 760static void rt_do_flush(struct net *net, int process_context)
 761{
 762        unsigned int i;
 763        struct rtable *rth, *next;
 764
 765        for (i = 0; i <= rt_hash_mask; i++) {
 766                struct rtable __rcu **pprev;
 767                struct rtable *list;
 768
 769                if (process_context && need_resched())
 770                        cond_resched();
 771                rth = rcu_access_pointer(rt_hash_table[i].chain);
 772                if (!rth)
 773                        continue;
 774
 775                spin_lock_bh(rt_hash_lock_addr(i));
 776
 777                list = NULL;
 778                pprev = &rt_hash_table[i].chain;
 779                rth = rcu_dereference_protected(*pprev,
 780                        lockdep_is_held(rt_hash_lock_addr(i)));
 781
 782                while (rth) {
 783                        next = rcu_dereference_protected(rth->dst.rt_next,
 784                                lockdep_is_held(rt_hash_lock_addr(i)));
 785
 786                        if (!net ||
 787                            net_eq(dev_net(rth->dst.dev), net)) {
 788                                rcu_assign_pointer(*pprev, next);
 789                                rcu_assign_pointer(rth->dst.rt_next, list);
 790                                list = rth;
 791                        } else {
 792                                pprev = &rth->dst.rt_next;
 793                        }
 794                        rth = next;
 795                }
 796
 797                spin_unlock_bh(rt_hash_lock_addr(i));
 798
 799                for (; list; list = next) {
 800                        next = rcu_dereference_protected(list->dst.rt_next, 1);
 801                        rt_free(list);
 802                }
 803        }
 804}
 805
 806/*
 807 * While freeing expired entries, we compute average chain length
 808 * and standard deviation, using fixed-point arithmetic.
 809 * This to have an estimation of rt_chain_length_max
 810 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 811 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 812 */
 813
 814#define FRACT_BITS 3
 815#define ONE (1UL << FRACT_BITS)
 816
 817/*
 818 * Given a hash chain and an item in this hash chain,
 819 * find if a previous entry has the same hash_inputs
 820 * (but differs on tos, mark or oif)
 821 * Returns 0 if an alias is found.
 822 * Returns ONE if rth has no alias before itself.
 823 */
 824static int has_noalias(const struct rtable *head, const struct rtable *rth)
 825{
 826        const struct rtable *aux = head;
 827
 828        while (aux != rth) {
 829                if (compare_hash_inputs(aux, rth))
 830                        return 0;
 831                aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 832        }
 833        return ONE;
 834}
 835
 836static void rt_check_expire(void)
 837{
 838        static unsigned int rover;
 839        unsigned int i = rover, goal;
 840        struct rtable *rth;
 841        struct rtable __rcu **rthp;
 842        unsigned long samples = 0;
 843        unsigned long sum = 0, sum2 = 0;
 844        unsigned long delta;
 845        u64 mult;
 846
 847        delta = jiffies - expires_ljiffies;
 848        expires_ljiffies = jiffies;
 849        mult = ((u64)delta) << rt_hash_log;
 850        if (ip_rt_gc_timeout > 1)
 851                do_div(mult, ip_rt_gc_timeout);
 852        goal = (unsigned int)mult;
 853        if (goal > rt_hash_mask)
 854                goal = rt_hash_mask + 1;
 855        for (; goal > 0; goal--) {
 856                unsigned long tmo = ip_rt_gc_timeout;
 857                unsigned long length;
 858
 859                i = (i + 1) & rt_hash_mask;
 860                rthp = &rt_hash_table[i].chain;
 861
 862                if (need_resched())
 863                        cond_resched();
 864
 865                samples++;
 866
 867                if (rcu_dereference_raw(*rthp) == NULL)
 868                        continue;
 869                length = 0;
 870                spin_lock_bh(rt_hash_lock_addr(i));
 871                while ((rth = rcu_dereference_protected(*rthp,
 872                                        lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 873                        prefetch(rth->dst.rt_next);
 874                        if (rt_is_expired(rth)) {
 875                                *rthp = rth->dst.rt_next;
 876                                rt_free(rth);
 877                                continue;
 878                        }
 879                        if (rth->dst.expires) {
 880                                /* Entry is expired even if it is in use */
 881                                if (time_before_eq(jiffies, rth->dst.expires)) {
 882nofree:
 883                                        tmo >>= 1;
 884                                        rthp = &rth->dst.rt_next;
 885                                        /*
 886                                         * We only count entries on
 887                                         * a chain with equal hash inputs once
 888                                         * so that entries for different QOS
 889                                         * levels, and other non-hash input
 890                                         * attributes don't unfairly skew
 891                                         * the length computation
 892                                         */
 893                                        length += has_noalias(rt_hash_table[i].chain, rth);
 894                                        continue;
 895                                }
 896                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 897                                goto nofree;
 898
 899                        /* Cleanup aged off entries. */
 900                        *rthp = rth->dst.rt_next;
 901                        rt_free(rth);
 902                }
 903                spin_unlock_bh(rt_hash_lock_addr(i));
 904                sum += length;
 905                sum2 += length*length;
 906        }
 907        if (samples) {
 908                unsigned long avg = sum / samples;
 909                unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 910                rt_chain_length_max = max_t(unsigned long,
 911                                        ip_rt_gc_elasticity,
 912                                        (avg + 4*sd) >> FRACT_BITS);
 913        }
 914        rover = i;
 915}
 916
 917/*
 918 * rt_worker_func() is run in process context.
 919 * we call rt_check_expire() to scan part of the hash table
 920 */
 921static void rt_worker_func(struct work_struct *work)
 922{
 923        rt_check_expire();
 924        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 925}
 926
 927/*
 928 * Perturbation of rt_genid by a small quantity [1..256]
 929 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 930 * many times (2^24) without giving recent rt_genid.
 931 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 932 */
 933static void rt_cache_invalidate(struct net *net)
 934{
 935        unsigned char shuffle;
 936
 937        get_random_bytes(&shuffle, sizeof(shuffle));
 938        atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 939        inetpeer_invalidate_tree(AF_INET);
 940}
 941
 942/*
 943 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 944 * delay >= 0 : invalidate & flush cache (can be long)
 945 */
 946void rt_cache_flush(struct net *net, int delay)
 947{
 948        rt_cache_invalidate(net);
 949        if (delay >= 0)
 950                rt_do_flush(net, !in_softirq());
 951}
 952
 953/* Flush previous cache invalidated entries from the cache */
 954void rt_cache_flush_batch(struct net *net)
 955{
 956        rt_do_flush(net, !in_softirq());
 957}
 958
 959static void rt_emergency_hash_rebuild(struct net *net)
 960{
 961        if (net_ratelimit())
 962                printk(KERN_WARNING "Route hash chain too long!\n");
 963        rt_cache_invalidate(net);
 964}
 965
 966/*
 967   Short description of GC goals.
 968
 969   We want to build algorithm, which will keep routing cache
 970   at some equilibrium point, when number of aged off entries
 971   is kept approximately equal to newly generated ones.
 972
 973   Current expiration strength is variable "expire".
 974   We try to adjust it dynamically, so that if networking
 975   is idle expires is large enough to keep enough of warm entries,
 976   and when load increases it reduces to limit cache size.
 977 */
 978
 979static int rt_garbage_collect(struct dst_ops *ops)
 980{
 981        static unsigned long expire = RT_GC_TIMEOUT;
 982        static unsigned long last_gc;
 983        static int rover;
 984        static int equilibrium;
 985        struct rtable *rth;
 986        struct rtable __rcu **rthp;
 987        unsigned long now = jiffies;
 988        int goal;
 989        int entries = dst_entries_get_fast(&ipv4_dst_ops);
 990
 991        /*
 992         * Garbage collection is pretty expensive,
 993         * do not make it too frequently.
 994         */
 995
 996        RT_CACHE_STAT_INC(gc_total);
 997
 998        if (now - last_gc < ip_rt_gc_min_interval &&
 999            entries < ip_rt_max_size) {
1000                RT_CACHE_STAT_INC(gc_ignored);
1001                goto out;
1002        }
1003
1004        entries = dst_entries_get_slow(&ipv4_dst_ops);
1005        /* Calculate number of entries, which we want to expire now. */
1006        goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1007        if (goal <= 0) {
1008                if (equilibrium < ipv4_dst_ops.gc_thresh)
1009                        equilibrium = ipv4_dst_ops.gc_thresh;
1010                goal = entries - equilibrium;
1011                if (goal > 0) {
1012                        equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1013                        goal = entries - equilibrium;
1014                }
1015        } else {
1016                /* We are in dangerous area. Try to reduce cache really
1017                 * aggressively.
1018                 */
1019                goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1020                equilibrium = entries - goal;
1021        }
1022
1023        if (now - last_gc >= ip_rt_gc_min_interval)
1024                last_gc = now;
1025
1026        if (goal <= 0) {
1027                equilibrium += goal;
1028                goto work_done;
1029        }
1030
1031        do {
1032                int i, k;
1033
1034                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1035                        unsigned long tmo = expire;
1036
1037                        k = (k + 1) & rt_hash_mask;
1038                        rthp = &rt_hash_table[k].chain;
1039                        spin_lock_bh(rt_hash_lock_addr(k));
1040                        while ((rth = rcu_dereference_protected(*rthp,
1041                                        lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1042                                if (!rt_is_expired(rth) &&
1043                                        !rt_may_expire(rth, tmo, expire)) {
1044                                        tmo >>= 1;
1045                                        rthp = &rth->dst.rt_next;
1046                                        continue;
1047                                }
1048                                *rthp = rth->dst.rt_next;
1049                                rt_free(rth);
1050                                goal--;
1051                        }
1052                        spin_unlock_bh(rt_hash_lock_addr(k));
1053                        if (goal <= 0)
1054                                break;
1055                }
1056                rover = k;
1057
1058                if (goal <= 0)
1059                        goto work_done;
1060
1061                /* Goal is not achieved. We stop process if:
1062
1063                   - if expire reduced to zero. Otherwise, expire is halfed.
1064                   - if table is not full.
1065                   - if we are called from interrupt.
1066                   - jiffies check is just fallback/debug loop breaker.
1067                     We will not spin here for long time in any case.
1068                 */
1069
1070                RT_CACHE_STAT_INC(gc_goal_miss);
1071
1072                if (expire == 0)
1073                        break;
1074
1075                expire >>= 1;
1076
1077                if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1078                        goto out;
1079        } while (!in_softirq() && time_before_eq(jiffies, now));
1080
1081        if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1082                goto out;
1083        if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1084                goto out;
1085        if (net_ratelimit())
1086                printk(KERN_WARNING "dst cache overflow\n");
1087        RT_CACHE_STAT_INC(gc_dst_overflow);
1088        return 1;
1089
1090work_done:
1091        expire += ip_rt_gc_min_interval;
1092        if (expire > ip_rt_gc_timeout ||
1093            dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094            dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095                expire = ip_rt_gc_timeout;
1096out:    return 0;
1097}
1098
1099/*
1100 * Returns number of entries in a hash chain that have different hash_inputs
1101 */
1102static int slow_chain_length(const struct rtable *head)
1103{
1104        int length = 0;
1105        const struct rtable *rth = head;
1106
1107        while (rth) {
1108                length += has_noalias(head, rth);
1109                rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110        }
1111        return length >> FRACT_BITS;
1112}
1113
1114static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1115{
1116        static const __be32 inaddr_any = 0;
1117        struct net_device *dev = dst->dev;
1118        const __be32 *pkey = daddr;
1119        struct neighbour *n;
1120
1121        if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1122                pkey = &inaddr_any;
1123
1124        n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1125        if (n)
1126                return n;
1127        return neigh_create(&arp_tbl, pkey, dev);
1128}
1129
1130static int rt_bind_neighbour(struct rtable *rt)
1131{
1132        struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1133        if (IS_ERR(n))
1134                return PTR_ERR(n);
1135        dst_set_neighbour(&rt->dst, n);
1136
1137        return 0;
1138}
1139
1140static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1141                                     struct sk_buff *skb, int ifindex)
1142{
1143        struct rtable   *rth, *cand;
1144        struct rtable __rcu **rthp, **candp;
1145        unsigned long   now;
1146        u32             min_score;
1147        int             chain_length;
1148        int attempts = !in_softirq();
1149
1150restart:
1151        chain_length = 0;
1152        min_score = ~(u32)0;
1153        cand = NULL;
1154        candp = NULL;
1155        now = jiffies;
1156
1157        if (!rt_caching(dev_net(rt->dst.dev))) {
1158                /*
1159                 * If we're not caching, just tell the caller we
1160                 * were successful and don't touch the route.  The
1161                 * caller hold the sole reference to the cache entry, and
1162                 * it will be released when the caller is done with it.
1163                 * If we drop it here, the callers have no way to resolve routes
1164                 * when we're not caching.  Instead, just point *rp at rt, so
1165                 * the caller gets a single use out of the route
1166                 * Note that we do rt_free on this new route entry, so that
1167                 * once its refcount hits zero, we are still able to reap it
1168                 * (Thanks Alexey)
1169                 * Note: To avoid expensive rcu stuff for this uncached dst,
1170                 * we set DST_NOCACHE so that dst_release() can free dst without
1171                 * waiting a grace period.
1172                 */
1173
1174                rt->dst.flags |= DST_NOCACHE;
1175                if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1176                        int err = rt_bind_neighbour(rt);
1177                        if (err) {
1178                                if (net_ratelimit())
1179                                        printk(KERN_WARNING
1180                                            "Neighbour table failure & not caching routes.\n");
1181                                ip_rt_put(rt);
1182                                return ERR_PTR(err);
1183                        }
1184                }
1185
1186                goto skip_hashing;
1187        }
1188
1189        rthp = &rt_hash_table[hash].chain;
1190
1191        spin_lock_bh(rt_hash_lock_addr(hash));
1192        while ((rth = rcu_dereference_protected(*rthp,
1193                        lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1194                if (rt_is_expired(rth)) {
1195                        *rthp = rth->dst.rt_next;
1196                        rt_free(rth);
1197                        continue;
1198                }
1199                if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1200                        /* Put it first */
1201                        *rthp = rth->dst.rt_next;
1202                        /*
1203                         * Since lookup is lockfree, the deletion
1204                         * must be visible to another weakly ordered CPU before
1205                         * the insertion at the start of the hash chain.
1206                         */
1207                        rcu_assign_pointer(rth->dst.rt_next,
1208                                           rt_hash_table[hash].chain);
1209                        /*
1210                         * Since lookup is lockfree, the update writes
1211                         * must be ordered for consistency on SMP.
1212                         */
1213                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1214
1215                        dst_use(&rth->dst, now);
1216                        spin_unlock_bh(rt_hash_lock_addr(hash));
1217
1218                        rt_drop(rt);
1219                        if (skb)
1220                                skb_dst_set(skb, &rth->dst);
1221                        return rth;
1222                }
1223
1224                if (!atomic_read(&rth->dst.__refcnt)) {
1225                        u32 score = rt_score(rth);
1226
1227                        if (score <= min_score) {
1228                                cand = rth;
1229                                candp = rthp;
1230                                min_score = score;
1231                        }
1232                }
1233
1234                chain_length++;
1235
1236                rthp = &rth->dst.rt_next;
1237        }
1238
1239        if (cand) {
1240                /* ip_rt_gc_elasticity used to be average length of chain
1241                 * length, when exceeded gc becomes really aggressive.
1242                 *
1243                 * The second limit is less certain. At the moment it allows
1244                 * only 2 entries per bucket. We will see.
1245                 */
1246                if (chain_length > ip_rt_gc_elasticity) {
1247                        *candp = cand->dst.rt_next;
1248                        rt_free(cand);
1249                }
1250        } else {
1251                if (chain_length > rt_chain_length_max &&
1252                    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1253                        struct net *net = dev_net(rt->dst.dev);
1254                        int num = ++net->ipv4.current_rt_cache_rebuild_count;
1255                        if (!rt_caching(net)) {
1256                                printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1257                                        rt->dst.dev->name, num);
1258                        }
1259                        rt_emergency_hash_rebuild(net);
1260                        spin_unlock_bh(rt_hash_lock_addr(hash));
1261
1262                        hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1263                                        ifindex, rt_genid(net));
1264                        goto restart;
1265                }
1266        }
1267
1268        /* Try to bind route to arp only if it is output
1269           route or unicast forwarding path.
1270         */
1271        if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1272                int err = rt_bind_neighbour(rt);
1273                if (err) {
1274                        spin_unlock_bh(rt_hash_lock_addr(hash));
1275
1276                        if (err != -ENOBUFS) {
1277                                rt_drop(rt);
1278                                return ERR_PTR(err);
1279                        }
1280
1281                        /* Neighbour tables are full and nothing
1282                           can be released. Try to shrink route cache,
1283                           it is most likely it holds some neighbour records.
1284                         */
1285                        if (attempts-- > 0) {
1286                                int saved_elasticity = ip_rt_gc_elasticity;
1287                                int saved_int = ip_rt_gc_min_interval;
1288                                ip_rt_gc_elasticity     = 1;
1289                                ip_rt_gc_min_interval   = 0;
1290                                rt_garbage_collect(&ipv4_dst_ops);
1291                                ip_rt_gc_min_interval   = saved_int;
1292                                ip_rt_gc_elasticity     = saved_elasticity;
1293                                goto restart;
1294                        }
1295
1296                        if (net_ratelimit())
1297                                printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1298                        rt_drop(rt);
1299                        return ERR_PTR(-ENOBUFS);
1300                }
1301        }
1302
1303        rt->dst.rt_next = rt_hash_table[hash].chain;
1304
1305        /*
1306         * Since lookup is lockfree, we must make sure
1307         * previous writes to rt are committed to memory
1308         * before making rt visible to other CPUS.
1309         */
1310        rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1311
1312        spin_unlock_bh(rt_hash_lock_addr(hash));
1313
1314skip_hashing:
1315        if (skb)
1316                skb_dst_set(skb, &rt->dst);
1317        return rt;
1318}
1319
1320static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1321
1322static u32 rt_peer_genid(void)
1323{
1324        return atomic_read(&__rt_peer_genid);
1325}
1326
1327void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1328{
1329        struct inet_peer *peer;
1330
1331        peer = inet_getpeer_v4(daddr, create);
1332
1333        if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1334                inet_putpeer(peer);
1335        else
1336                rt->rt_peer_genid = rt_peer_genid();
1337}
1338
1339/*
1340 * Peer allocation may fail only in serious out-of-memory conditions.  However
1341 * we still can generate some output.
1342 * Random ID selection looks a bit dangerous because we have no chances to
1343 * select ID being unique in a reasonable period of time.
1344 * But broken packet identifier may be better than no packet at all.
1345 */
1346static void ip_select_fb_ident(struct iphdr *iph)
1347{
1348        static DEFINE_SPINLOCK(ip_fb_id_lock);
1349        static u32 ip_fallback_id;
1350        u32 salt;
1351
1352        spin_lock_bh(&ip_fb_id_lock);
1353        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1354        iph->id = htons(salt & 0xFFFF);
1355        ip_fallback_id = salt;
1356        spin_unlock_bh(&ip_fb_id_lock);
1357}
1358
1359void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1360{
1361        struct rtable *rt = (struct rtable *) dst;
1362
1363        if (rt && !(rt->dst.flags & DST_NOPEER)) {
1364                if (rt->peer == NULL)
1365                        rt_bind_peer(rt, rt->rt_dst, 1);
1366
1367                /* If peer is attached to destination, it is never detached,
1368                   so that we need not to grab a lock to dereference it.
1369                 */
1370                if (rt->peer) {
1371                        iph->id = htons(inet_getid(rt->peer, more));
1372                        return;
1373                }
1374        } else if (!rt)
1375                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1376                       __builtin_return_address(0));
1377
1378        ip_select_fb_ident(iph);
1379}
1380EXPORT_SYMBOL(__ip_select_ident);
1381
1382static void rt_del(unsigned hash, struct rtable *rt)
1383{
1384        struct rtable __rcu **rthp;
1385        struct rtable *aux;
1386
1387        rthp = &rt_hash_table[hash].chain;
1388        spin_lock_bh(rt_hash_lock_addr(hash));
1389        ip_rt_put(rt);
1390        while ((aux = rcu_dereference_protected(*rthp,
1391                        lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1392                if (aux == rt || rt_is_expired(aux)) {
1393                        *rthp = aux->dst.rt_next;
1394                        rt_free(aux);
1395                        continue;
1396                }
1397                rthp = &aux->dst.rt_next;
1398        }
1399        spin_unlock_bh(rt_hash_lock_addr(hash));
1400}
1401
1402static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1403{
1404        struct rtable *rt = (struct rtable *) dst;
1405        __be32 orig_gw = rt->rt_gateway;
1406        struct neighbour *n, *old_n;
1407
1408        dst_confirm(&rt->dst);
1409
1410        rt->rt_gateway = peer->redirect_learned.a4;
1411
1412        n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1413        if (IS_ERR(n)) {
1414                rt->rt_gateway = orig_gw;
1415                return;
1416        }
1417        old_n = xchg(&rt->dst._neighbour, n);
1418        if (old_n)
1419                neigh_release(old_n);
1420        if (!(n->nud_state & NUD_VALID)) {
1421                neigh_event_send(n, NULL);
1422        } else {
1423                rt->rt_flags |= RTCF_REDIRECTED;
1424                call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1425        }
1426}
1427
1428/* called in rcu_read_lock() section */
1429void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1430                    __be32 saddr, struct net_device *dev)
1431{
1432        int s, i;
1433        struct in_device *in_dev = __in_dev_get_rcu(dev);
1434        __be32 skeys[2] = { saddr, 0 };
1435        int    ikeys[2] = { dev->ifindex, 0 };
1436        struct inet_peer *peer;
1437        struct net *net;
1438
1439        if (!in_dev)
1440                return;
1441
1442        net = dev_net(dev);
1443        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1444            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1445            ipv4_is_zeronet(new_gw))
1446                goto reject_redirect;
1447
1448        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1449                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1450                        goto reject_redirect;
1451                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1452                        goto reject_redirect;
1453        } else {
1454                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1455                        goto reject_redirect;
1456        }
1457
1458        for (s = 0; s < 2; s++) {
1459                for (i = 0; i < 2; i++) {
1460                        unsigned int hash;
1461                        struct rtable __rcu **rthp;
1462                        struct rtable *rt;
1463
1464                        hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1465
1466                        rthp = &rt_hash_table[hash].chain;
1467
1468                        while ((rt = rcu_dereference(*rthp)) != NULL) {
1469                                rthp = &rt->dst.rt_next;
1470
1471                                if (rt->rt_key_dst != daddr ||
1472                                    rt->rt_key_src != skeys[s] ||
1473                                    rt->rt_oif != ikeys[i] ||
1474                                    rt_is_input_route(rt) ||
1475                                    rt_is_expired(rt) ||
1476                                    !net_eq(dev_net(rt->dst.dev), net) ||
1477                                    rt->dst.error ||
1478                                    rt->dst.dev != dev ||
1479                                    rt->rt_gateway != old_gw)
1480                                        continue;
1481
1482                                if (!rt->peer)
1483                                        rt_bind_peer(rt, rt->rt_dst, 1);
1484
1485                                peer = rt->peer;
1486                                if (peer) {
1487                                        if (peer->redirect_learned.a4 != new_gw) {
1488                                                peer->redirect_learned.a4 = new_gw;
1489                                                atomic_inc(&__rt_peer_genid);
1490                                        }
1491                                        check_peer_redir(&rt->dst, peer);
1492                                }
1493                        }
1494                }
1495        }
1496        return;
1497
1498reject_redirect:
1499#ifdef CONFIG_IP_ROUTE_VERBOSE
1500        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1501                printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1502                        "  Advised path = %pI4 -> %pI4\n",
1503                       &old_gw, dev->name, &new_gw,
1504                       &saddr, &daddr);
1505#endif
1506        ;
1507}
1508
1509static bool peer_pmtu_expired(struct inet_peer *peer)
1510{
1511        unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1512
1513        return orig &&
1514               time_after_eq(jiffies, orig) &&
1515               cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1516}
1517
1518static bool peer_pmtu_cleaned(struct inet_peer *peer)
1519{
1520        unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1521
1522        return orig &&
1523               cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1524}
1525
1526static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1527{
1528        struct rtable *rt = (struct rtable *)dst;
1529        struct dst_entry *ret = dst;
1530
1531        if (rt) {
1532                if (dst->obsolete > 0) {
1533                        ip_rt_put(rt);
1534                        ret = NULL;
1535                } else if (rt->rt_flags & RTCF_REDIRECTED) {
1536                        unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1537                                                rt->rt_oif,
1538                                                rt_genid(dev_net(dst->dev)));
1539                        rt_del(hash, rt);
1540                        ret = NULL;
1541                } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1542                        dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1543                }
1544        }
1545        return ret;
1546}
1547
1548/*
1549 * Algorithm:
1550 *      1. The first ip_rt_redirect_number redirects are sent
1551 *         with exponential backoff, then we stop sending them at all,
1552 *         assuming that the host ignores our redirects.
1553 *      2. If we did not see packets requiring redirects
1554 *         during ip_rt_redirect_silence, we assume that the host
1555 *         forgot redirected route and start to send redirects again.
1556 *
1557 * This algorithm is much cheaper and more intelligent than dumb load limiting
1558 * in icmp.c.
1559 *
1560 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1561 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1562 */
1563
1564void ip_rt_send_redirect(struct sk_buff *skb)
1565{
1566        struct rtable *rt = skb_rtable(skb);
1567        struct in_device *in_dev;
1568        struct inet_peer *peer;
1569        int log_martians;
1570
1571        rcu_read_lock();
1572        in_dev = __in_dev_get_rcu(rt->dst.dev);
1573        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1574                rcu_read_unlock();
1575                return;
1576        }
1577        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1578        rcu_read_unlock();
1579
1580        if (!rt->peer)
1581                rt_bind_peer(rt, rt->rt_dst, 1);
1582        peer = rt->peer;
1583        if (!peer) {
1584                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1585                return;
1586        }
1587
1588        /* No redirected packets during ip_rt_redirect_silence;
1589         * reset the algorithm.
1590         */
1591        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1592                peer->rate_tokens = 0;
1593
1594        /* Too many ignored redirects; do not send anything
1595         * set dst.rate_last to the last seen redirected packet.
1596         */
1597        if (peer->rate_tokens >= ip_rt_redirect_number) {
1598                peer->rate_last = jiffies;
1599                return;
1600        }
1601
1602        /* Check for load limit; set rate_last to the latest sent
1603         * redirect.
1604         */
1605        if (peer->rate_tokens == 0 ||
1606            time_after(jiffies,
1607                       (peer->rate_last +
1608                        (ip_rt_redirect_load << peer->rate_tokens)))) {
1609                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1610                peer->rate_last = jiffies;
1611                ++peer->rate_tokens;
1612#ifdef CONFIG_IP_ROUTE_VERBOSE
1613                if (log_martians &&
1614                    peer->rate_tokens == ip_rt_redirect_number &&
1615                    net_ratelimit())
1616                        printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1617                               &ip_hdr(skb)->saddr, rt->rt_iif,
1618                                &rt->rt_dst, &rt->rt_gateway);
1619#endif
1620        }
1621}
1622
1623static int ip_error(struct sk_buff *skb)
1624{
1625        struct rtable *rt = skb_rtable(skb);
1626        struct inet_peer *peer;
1627        unsigned long now;
1628        bool send;
1629        int code;
1630
1631        switch (rt->dst.error) {
1632        case EINVAL:
1633        default:
1634                goto out;
1635        case EHOSTUNREACH:
1636                code = ICMP_HOST_UNREACH;
1637                break;
1638        case ENETUNREACH:
1639                code = ICMP_NET_UNREACH;
1640                IP_INC_STATS_BH(dev_net(rt->dst.dev),
1641                                IPSTATS_MIB_INNOROUTES);
1642                break;
1643        case EACCES:
1644                code = ICMP_PKT_FILTERED;
1645                break;
1646        }
1647
1648        if (!rt->peer)
1649                rt_bind_peer(rt, rt->rt_dst, 1);
1650        peer = rt->peer;
1651
1652        send = true;
1653        if (peer) {
1654                now = jiffies;
1655                peer->rate_tokens += now - peer->rate_last;
1656                if (peer->rate_tokens > ip_rt_error_burst)
1657                        peer->rate_tokens = ip_rt_error_burst;
1658                peer->rate_last = now;
1659                if (peer->rate_tokens >= ip_rt_error_cost)
1660                        peer->rate_tokens -= ip_rt_error_cost;
1661                else
1662                        send = false;
1663        }
1664        if (send)
1665                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1666
1667out:    kfree_skb(skb);
1668        return 0;
1669}
1670
/*
 *      MTU plateau table, largest first.
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
        32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the first (i.e. largest) plateau strictly below old_mtu,
 * or 68 when old_mtu is not above any table entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *plateau = mtu_plateau;
        const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

        while (plateau < end) {
                if (*plateau < old_mtu)
                        return *plateau;
                plateau++;
        }
        return 68;
}
1688
1689unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1690                                 unsigned short new_mtu,
1691                                 struct net_device *dev)
1692{
1693        unsigned short old_mtu = ntohs(iph->tot_len);
1694        unsigned short est_mtu = 0;
1695        struct inet_peer *peer;
1696
1697        peer = inet_getpeer_v4(iph->daddr, 1);
1698        if (peer) {
1699                unsigned short mtu = new_mtu;
1700
1701                if (new_mtu < 68 || new_mtu >= old_mtu) {
1702                        /* BSD 4.2 derived systems incorrectly adjust
1703                         * tot_len by the IP header length, and report
1704                         * a zero MTU in the ICMP message.
1705                         */
1706                        if (mtu == 0 &&
1707                            old_mtu >= 68 + (iph->ihl << 2))
1708                                old_mtu -= iph->ihl << 2;
1709                        mtu = guess_mtu(old_mtu);
1710                }
1711
1712                if (mtu < ip_rt_min_pmtu)
1713                        mtu = ip_rt_min_pmtu;
1714                if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1715                        unsigned long pmtu_expires;
1716
1717                        pmtu_expires = jiffies + ip_rt_mtu_expires;
1718                        if (!pmtu_expires)
1719                                pmtu_expires = 1UL;
1720
1721                        est_mtu = mtu;
1722                        peer->pmtu_learned = mtu;
1723                        peer->pmtu_expires = pmtu_expires;
1724                        atomic_inc(&__rt_peer_genid);
1725                }
1726
1727                inet_putpeer(peer);
1728        }
1729        return est_mtu ? : new_mtu;
1730}
1731
1732static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1733{
1734        unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1735
1736        if (!expires)
1737                return;
1738        if (time_before(jiffies, expires)) {
1739                u32 orig_dst_mtu = dst_mtu(dst);
1740                if (peer->pmtu_learned < orig_dst_mtu) {
1741                        if (!peer->pmtu_orig)
1742                                peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1743                        dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1744                }
1745        } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1746                dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1747}
1748
1749static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1750{
1751        struct rtable *rt = (struct rtable *) dst;
1752        struct inet_peer *peer;
1753
1754        dst_confirm(dst);
1755
1756        if (!rt->peer)
1757                rt_bind_peer(rt, rt->rt_dst, 1);
1758        peer = rt->peer;
1759        if (peer) {
1760                unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1761
1762                if (mtu < ip_rt_min_pmtu)
1763                        mtu = ip_rt_min_pmtu;
1764                if (!pmtu_expires || mtu < peer->pmtu_learned) {
1765
1766                        pmtu_expires = jiffies + ip_rt_mtu_expires;
1767                        if (!pmtu_expires)
1768                                pmtu_expires = 1UL;
1769
1770                        peer->pmtu_learned = mtu;
1771                        peer->pmtu_expires = pmtu_expires;
1772
1773                        atomic_inc(&__rt_peer_genid);
1774                        rt->rt_peer_genid = rt_peer_genid();
1775                }
1776                check_peer_pmtu(dst, peer);
1777        }
1778}
1779
1780
1781static void ipv4_validate_peer(struct rtable *rt)
1782{
1783        if (rt->rt_peer_genid != rt_peer_genid()) {
1784                struct inet_peer *peer;
1785
1786                if (!rt->peer)
1787                        rt_bind_peer(rt, rt->rt_dst, 0);
1788
1789                peer = rt->peer;
1790                if (peer) {
1791                        check_peer_pmtu(&rt->dst, peer);
1792
1793                        if (peer->redirect_learned.a4 &&
1794                            peer->redirect_learned.a4 != rt->rt_gateway)
1795                                check_peer_redir(&rt->dst, peer);
1796                }
1797
1798                rt->rt_peer_genid = rt_peer_genid();
1799        }
1800}
1801
1802static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803{
1804        struct rtable *rt = (struct rtable *) dst;
1805
1806        if (rt_is_expired(rt))
1807                return NULL;
1808        ipv4_validate_peer(rt);
1809        return dst;
1810}
1811
1812static void ipv4_dst_destroy(struct dst_entry *dst)
1813{
1814        struct rtable *rt = (struct rtable *) dst;
1815        struct inet_peer *peer = rt->peer;
1816
1817        if (rt->fi) {
1818                fib_info_put(rt->fi);
1819                rt->fi = NULL;
1820        }
1821        if (peer) {
1822                rt->peer = NULL;
1823                inet_putpeer(peer);
1824        }
1825}
1826
1827
1828static void ipv4_link_failure(struct sk_buff *skb)
1829{
1830        struct rtable *rt;
1831
1832        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833
1834        rt = skb_rtable(skb);
1835        if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1836                dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1837}
1838
1839static int ip_rt_bug(struct sk_buff *skb)
1840{
1841        printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1842                &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1843                skb->dev ? skb->dev->name : "?");
1844        kfree_skb(skb);
1845        WARN_ON(1);
1846        return 0;
1847}
1848
1849/*
1850   We do not cache source address of outgoing interface,
1851   because it is used only by IP RR, TS and SRR options,
1852   so that it out of fast path.
1853
1854   BTW remember: "addr" is allowed to be not aligned
1855   in IP options!
1856 */
1857
1858void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1859{
1860        __be32 src;
1861
1862        if (rt_is_output_route(rt))
1863                src = ip_hdr(skb)->saddr;
1864        else {
1865                struct fib_result res;
1866                struct flowi4 fl4;
1867                struct iphdr *iph;
1868
1869                iph = ip_hdr(skb);
1870
1871                memset(&fl4, 0, sizeof(fl4));
1872                fl4.daddr = iph->daddr;
1873                fl4.saddr = iph->saddr;
1874                fl4.flowi4_tos = RT_TOS(iph->tos);
1875                fl4.flowi4_oif = rt->dst.dev->ifindex;
1876                fl4.flowi4_iif = skb->dev->ifindex;
1877                fl4.flowi4_mark = skb->mark;
1878
1879                rcu_read_lock();
1880                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1881                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1882                else
1883                        src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1884                                        RT_SCOPE_UNIVERSE);
1885                rcu_read_unlock();
1886        }
1887        memcpy(addr, &src, 4);
1888}
1889
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into rt->dst.tclassid one 16-bit half at a time: a half is
 * taken from @tag only if that half of the class id is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;
        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1899
1900static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1901{
1902        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1903
1904        if (advmss == 0) {
1905                advmss = max_t(unsigned int, dst->dev->mtu - 40,
1906                               ip_rt_min_advmss);
1907                if (advmss > 65535 - 40)
1908                        advmss = 65535 - 40;
1909        }
1910        return advmss;
1911}
1912
1913static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914{
1915        const struct rtable *rt = (const struct rtable *) dst;
1916        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1917
1918        if (mtu && rt_is_output_route(rt))
1919                return mtu;
1920
1921        mtu = dst->dev->mtu;
1922
1923        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1924
1925                if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926                        mtu = 576;
1927        }
1928
1929        if (mtu > IP_MAX_MTU)
1930                mtu = IP_MAX_MTU;
1931
1932        return mtu;
1933}
1934
1935static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1936                            struct fib_info *fi)
1937{
1938        struct inet_peer *peer;
1939        int create = 0;
1940
1941        /* If a peer entry exists for this destination, we must hook
1942         * it up in order to get at cached metrics.
1943         */
1944        if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945                create = 1;
1946
1947        rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1948        if (peer) {
1949                rt->rt_peer_genid = rt_peer_genid();
1950                if (inet_metrics_new(peer))
1951                        memcpy(peer->metrics, fi->fib_metrics,
1952                               sizeof(u32) * RTAX_MAX);
1953                dst_init_metrics(&rt->dst, peer->metrics, false);
1954
1955                check_peer_pmtu(&rt->dst, peer);
1956
1957                if (peer->redirect_learned.a4 &&
1958                    peer->redirect_learned.a4 != rt->rt_gateway) {
1959                        rt->rt_gateway = peer->redirect_learned.a4;
1960                        rt->rt_flags |= RTCF_REDIRECTED;
1961                }
1962        } else {
1963                if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1964                        rt->fi = fi;
1965                        atomic_inc(&fi->fib_clntref);
1966                }
1967                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968        }
1969}
1970
1971static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1972                           const struct fib_result *res,
1973                           struct fib_info *fi, u16 type, u32 itag)
1974{
1975        struct dst_entry *dst = &rt->dst;
1976
1977        if (fi) {
1978                if (FIB_RES_GW(*res) &&
1979                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1980                        rt->rt_gateway = FIB_RES_GW(*res);
1981                rt_init_metrics(rt, fl4, fi);
1982#ifdef CONFIG_IP_ROUTE_CLASSID
1983                dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1984#endif
1985        }
1986
1987        if (dst_mtu(dst) > IP_MAX_MTU)
1988                dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989        if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990                dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1991
1992#ifdef CONFIG_IP_ROUTE_CLASSID
1993#ifdef CONFIG_IP_MULTIPLE_TABLES
1994        set_class_tag(rt, fib_rules_tclass(res));
1995#endif
1996        set_class_tag(rt, itag);
1997#endif
1998}
1999
2000static struct rtable *rt_dst_alloc(struct net_device *dev,
2001                                   bool nopolicy, bool noxfrm)
2002{
2003        return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2004                         DST_HOST |
2005                         (nopolicy ? DST_NOPOLICY : 0) |
2006                         (noxfrm ? DST_NOXFRM : 0));
2007}
2008
2009/* called in rcu_read_lock() section */
2010static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2011                                u8 tos, struct net_device *dev, int our)
2012{
2013        unsigned int hash;
2014        struct rtable *rth;
2015        __be32 spec_dst;
2016        struct in_device *in_dev = __in_dev_get_rcu(dev);
2017        u32 itag = 0;
2018        int err;
2019
2020        /* Primary sanity checks. */
2021
2022        if (in_dev == NULL)
2023                return -EINVAL;
2024
2025        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026            ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2027                goto e_inval;
2028
2029        if (ipv4_is_zeronet(saddr)) {
2030                if (!ipv4_is_local_multicast(daddr))
2031                        goto e_inval;
2032                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033        } else {
2034                err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2035                                          &itag);
2036                if (err < 0)
2037                        goto e_err;
2038        }
2039        rth = rt_dst_alloc(init_net.loopback_dev,
2040                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2041        if (!rth)
2042                goto e_nobufs;
2043
2044#ifdef CONFIG_IP_ROUTE_CLASSID
2045        rth->dst.tclassid = itag;
2046#endif
2047        rth->dst.output = ip_rt_bug;
2048
2049        rth->rt_key_dst = daddr;
2050        rth->rt_key_src = saddr;
2051        rth->rt_genid   = rt_genid(dev_net(dev));
2052        rth->rt_flags   = RTCF_MULTICAST;
2053        rth->rt_type    = RTN_MULTICAST;
2054        rth->rt_key_tos = tos;
2055        rth->rt_dst     = daddr;
2056        rth->rt_src     = saddr;
2057        rth->rt_route_iif = dev->ifindex;
2058        rth->rt_iif     = dev->ifindex;
2059        rth->rt_oif     = 0;
2060        rth->rt_mark    = skb->mark;
2061        rth->rt_gateway = daddr;
2062        rth->rt_spec_dst= spec_dst;
2063        rth->rt_peer_genid = 0;
2064        rth->peer = NULL;
2065        rth->fi = NULL;
2066        if (our) {
2067                rth->dst.input= ip_local_deliver;
2068                rth->rt_flags |= RTCF_LOCAL;
2069        }
2070
2071#ifdef CONFIG_IP_MROUTE
2072        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2073                rth->dst.input = ip_mr_input;
2074#endif
2075        RT_CACHE_STAT_INC(in_slow_mc);
2076
2077        hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2078        rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2079        return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2080
2081e_nobufs:
2082        return -ENOBUFS;
2083e_inval:
2084        return -EINVAL;
2085e_err:
2086        return err;
2087}
2088
2089
2090static void ip_handle_martian_source(struct net_device *dev,
2091                                     struct in_device *in_dev,
2092                                     struct sk_buff *skb,
2093                                     __be32 daddr,
2094                                     __be32 saddr)
2095{
2096        RT_CACHE_STAT_INC(in_martian_src);
2097#ifdef CONFIG_IP_ROUTE_VERBOSE
2098        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2099                /*
2100                 *      RFC1812 recommendation, if source is martian,
2101                 *      the only hint is MAC header.
2102                 */
2103                printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2104                        &daddr, &saddr, dev->name);
2105                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2106                        int i;
2107                        const unsigned char *p = skb_mac_header(skb);
2108                        printk(KERN_WARNING "ll header: ");
2109                        for (i = 0; i < dev->hard_header_len; i++, p++) {
2110                                printk("%02x", *p);
2111                                if (i < (dev->hard_header_len - 1))
2112                                        printk(":");
2113                        }
2114                        printk("\n");
2115                }
2116        }
2117#endif
2118}
2119
2120/* called in rcu_read_lock() section */
2121static int __mkroute_input(struct sk_buff *skb,
2122                           const struct fib_result *res,
2123                           struct in_device *in_dev,
2124                           __be32 daddr, __be32 saddr, u32 tos,
2125                           struct rtable **result)
2126{
2127        struct rtable *rth;
2128        int err;
2129        struct in_device *out_dev;
2130        unsigned int flags = 0;
2131        __be32 spec_dst;
2132        u32 itag;
2133
2134        /* get a working reference to the output device */
2135        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2136        if (out_dev == NULL) {
2137                if (net_ratelimit())
2138                        printk(KERN_CRIT "Bug in ip_route_input" \
2139                               "_slow(). Please, report\n");
2140                return -EINVAL;
2141        }
2142
2143
2144        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2145                                  in_dev->dev, &spec_dst, &itag);
2146        if (err < 0) {
2147                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2148                                         saddr);
2149
2150                goto cleanup;
2151        }
2152
2153        if (err)
2154                flags |= RTCF_DIRECTSRC;
2155
2156        if (out_dev == in_dev && err &&
2157            (IN_DEV_SHARED_MEDIA(out_dev) ||
2158             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2159                flags |= RTCF_DOREDIRECT;
2160
2161        if (skb->protocol != htons(ETH_P_IP)) {
2162                /* Not IP (i.e. ARP). Do not create route, if it is
2163                 * invalid for proxy arp. DNAT routes are always valid.
2164                 *
2165                 * Proxy arp feature have been extended to allow, ARP
2166                 * replies back to the same interface, to support
2167                 * Private VLAN switch technologies. See arp.c.
2168                 */
2169                if (out_dev == in_dev &&
2170                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2171                        err = -EINVAL;
2172                        goto cleanup;
2173                }
2174        }
2175
2176        rth = rt_dst_alloc(out_dev->dev,
2177                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2178                           IN_DEV_CONF_GET(out_dev, NOXFRM));
2179        if (!rth) {
2180                err = -ENOBUFS;
2181                goto cleanup;
2182        }
2183
2184        rth->rt_key_dst = daddr;
2185        rth->rt_key_src = saddr;
2186        rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2187        rth->rt_flags = flags;
2188        rth->rt_type = res->type;
2189        rth->rt_key_tos = tos;
2190        rth->rt_dst     = daddr;
2191        rth->rt_src     = saddr;
2192        rth->rt_route_iif = in_dev->dev->ifindex;
2193        rth->rt_iif     = in_dev->dev->ifindex;
2194        rth->rt_oif     = 0;
2195        rth->rt_mark    = skb->mark;
2196        rth->rt_gateway = daddr;
2197        rth->rt_spec_dst= spec_dst;
2198        rth->rt_peer_genid = 0;
2199        rth->peer = NULL;
2200        rth->fi = NULL;
2201
2202        rth->dst.input = ip_forward;
2203        rth->dst.output = ip_output;
2204
2205        rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2206
2207        *result = rth;
2208        err = 0;
2209 cleanup:
2210        return err;
2211}
2212
2213static int ip_mkroute_input(struct sk_buff *skb,
2214                            struct fib_result *res,
2215                            const struct flowi4 *fl4,
2216                            struct in_device *in_dev,
2217                            __be32 daddr, __be32 saddr, u32 tos)
2218{
2219        struct rtable* rth = NULL;
2220        int err;
2221        unsigned hash;
2222
2223#ifdef CONFIG_IP_ROUTE_MULTIPATH
2224        if (res->fi && res->fi->fib_nhs > 1)
2225                fib_select_multipath(res);
2226#endif
2227
2228        /* create a routing cache entry */
2229        err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2230        if (err)
2231                return err;
2232
2233        /* put it into the cache */
2234        hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2235                       rt_genid(dev_net(rth->dst.dev)));
2236        rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2237        if (IS_ERR(rth))
2238                return PTR_ERR(rth);
2239        return 0;
2240}
2241
2242/*
2243 *      NOTE. We drop all the packets that has local source
2244 *      addresses, because every properly looped back packet
2245 *      must have correct destination already attached by output routine.
2246 *
2247 *      Such approach solves two big problems:
2248 *      1. Not simplex devices are handled properly.
2249 *      2. IP spoofing attempts are filtered with 100% of guarantee.
2250 *      called with rcu_read_lock()
2251 */
2252
2253static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2254                               u8 tos, struct net_device *dev)
2255{
2256        struct fib_result res;
2257        struct in_device *in_dev = __in_dev_get_rcu(dev);
2258        struct flowi4   fl4;
2259        unsigned        flags = 0;
2260        u32             itag = 0;
2261        struct rtable * rth;
2262        unsigned        hash;
2263        __be32          spec_dst;
2264        int             err = -EINVAL;
2265        struct net    * net = dev_net(dev);
2266
2267        /* IP on this device is disabled. */
2268
2269        if (!in_dev)
2270                goto out;
2271
2272        /* Check for the most weird martians, which can be not detected
2273           by fib_lookup.
2274         */
2275
2276        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2277            ipv4_is_loopback(saddr))
2278                goto martian_source;
2279
2280        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2281                goto brd_input;
2282
2283        /* Accept zero addresses only to limited broadcast;
2284         * I even do not know to fix it or not. Waiting for complains :-)
2285         */
2286        if (ipv4_is_zeronet(saddr))
2287                goto martian_source;
2288
2289        if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2290                goto martian_destination;
2291
2292        /*
2293         *      Now we are ready to route packet.
2294         */
2295        fl4.flowi4_oif = 0;
2296        fl4.flowi4_iif = dev->ifindex;
2297        fl4.flowi4_mark = skb->mark;
2298        fl4.flowi4_tos = tos;
2299        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2300        fl4.daddr = daddr;
2301        fl4.saddr = saddr;
2302        err = fib_lookup(net, &fl4, &res);
2303        if (err != 0) {
2304                if (!IN_DEV_FORWARD(in_dev))
2305                        goto e_hostunreach;
2306                goto no_route;
2307        }
2308
2309        RT_CACHE_STAT_INC(in_slow_tot);
2310
2311        if (res.type == RTN_BROADCAST)
2312                goto brd_input;
2313
2314        if (res.type == RTN_LOCAL) {
2315                err = fib_validate_source(skb, saddr, daddr, tos,
2316                                          net->loopback_dev->ifindex,
2317                                          dev, &spec_dst, &itag);
2318                if (err < 0)
2319                        goto martian_source_keep_err;
2320                if (err)
2321                        flags |= RTCF_DIRECTSRC;
2322                spec_dst = daddr;
2323                goto local_input;
2324        }
2325
2326        if (!IN_DEV_FORWARD(in_dev))
2327                goto e_hostunreach;
2328        if (res.type != RTN_UNICAST)
2329                goto martian_destination;
2330
2331        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2332out:    return err;
2333
2334brd_input:
2335        if (skb->protocol != htons(ETH_P_IP))
2336                goto e_inval;
2337
2338        if (ipv4_is_zeronet(saddr))
2339                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2340        else {
2341                err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2342                                          &itag);
2343                if (err < 0)
2344                        goto martian_source_keep_err;
2345                if (err)
2346                        flags |= RTCF_DIRECTSRC;
2347        }
2348        flags |= RTCF_BROADCAST;
2349        res.type = RTN_BROADCAST;
2350        RT_CACHE_STAT_INC(in_brd);
2351
2352local_input:
2353        rth = rt_dst_alloc(net->loopback_dev,
2354                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2355        if (!rth)
2356                goto e_nobufs;
2357
2358        rth->dst.input= ip_local_deliver;
2359        rth->dst.output= ip_rt_bug;
2360#ifdef CONFIG_IP_ROUTE_CLASSID
2361        rth->dst.tclassid = itag;
2362#endif
2363
2364        rth->rt_key_dst = daddr;
2365        rth->rt_key_src = saddr;
2366        rth->rt_genid = rt_genid(net);
2367        rth->rt_flags   = flags|RTCF_LOCAL;
2368        rth->rt_type    = res.type;
2369        rth->rt_key_tos = tos;
2370        rth->rt_dst     = daddr;
2371        rth->rt_src     = saddr;
2372#ifdef CONFIG_IP_ROUTE_CLASSID
2373        rth->dst.tclassid = itag;
2374#endif
2375        rth->rt_route_iif = dev->ifindex;
2376        rth->rt_iif     = dev->ifindex;
2377        rth->rt_oif     = 0;
2378        rth->rt_mark    = skb->mark;
2379        rth->rt_gateway = daddr;
2380        rth->rt_spec_dst= spec_dst;
2381        rth->rt_peer_genid = 0;
2382        rth->peer = NULL;
2383        rth->fi = NULL;
2384        if (res.type == RTN_UNREACHABLE) {
2385                rth->dst.input= ip_error;
2386                rth->dst.error= -err;
2387                rth->rt_flags   &= ~RTCF_LOCAL;
2388        }
2389        hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2390        rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2391        err = 0;
2392        if (IS_ERR(rth))
2393                err = PTR_ERR(rth);
2394        goto out;
2395
2396no_route:
2397        RT_CACHE_STAT_INC(in_no_route);
2398        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2399        res.type = RTN_UNREACHABLE;
2400        if (err == -ESRCH)
2401                err = -ENETUNREACH;
2402        goto local_input;
2403
2404        /*
2405         *      Do not cache martian addresses: they should be logged (RFC1812)
2406         */
2407martian_destination:
2408        RT_CACHE_STAT_INC(in_martian_dst);
2409#ifdef CONFIG_IP_ROUTE_VERBOSE
2410        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2411                printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2412                        &daddr, &saddr, dev->name);
2413#endif
2414
2415e_hostunreach:
2416        err = -EHOSTUNREACH;
2417        goto out;
2418
2419e_inval:
2420        err = -EINVAL;
2421        goto out;
2422
2423e_nobufs:
2424        err = -ENOBUFS;
2425        goto out;
2426
2427martian_source:
2428        err = -EINVAL;
2429martian_source_keep_err:
2430        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2431        goto out;
2432}
2433
2434int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2435                           u8 tos, struct net_device *dev, bool noref)
2436{
2437        struct rtable * rth;
2438        unsigned        hash;
2439        int iif = dev->ifindex;
2440        struct net *net;
2441        int res;
2442
2443        net = dev_net(dev);
2444
2445        rcu_read_lock();
2446
2447        if (!rt_caching(net))
2448                goto skip_cache;
2449
2450        tos &= IPTOS_RT_MASK;
2451        hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2452
2453        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2454             rth = rcu_dereference(rth->dst.rt_next)) {
2455                if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2456                     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2457                     (rth->rt_route_iif ^ iif) |
2458                     (rth->rt_key_tos ^ tos)) == 0 &&
2459                    rth->rt_mark == skb->mark &&
2460                    net_eq(dev_net(rth->dst.dev), net) &&
2461                    !rt_is_expired(rth)) {
2462                        ipv4_validate_peer(rth);
2463                        if (noref) {
2464                                dst_use_noref(&rth->dst, jiffies);
2465                                skb_dst_set_noref(skb, &rth->dst);
2466                        } else {
2467                                dst_use(&rth->dst, jiffies);
2468                                skb_dst_set(skb, &rth->dst);
2469                        }
2470                        RT_CACHE_STAT_INC(in_hit);
2471                        rcu_read_unlock();
2472                        return 0;
2473                }
2474                RT_CACHE_STAT_INC(in_hlist_search);
2475        }
2476
2477skip_cache:
2478        /* Multicast recognition logic is moved from route cache to here.
2479           The problem was that too many Ethernet cards have broken/missing
2480           hardware multicast filters :-( As result the host on multicasting
2481           network acquires a lot of useless route cache entries, sort of
2482           SDR messages from all the world. Now we try to get rid of them.
2483           Really, provided software IP multicast filter is organized
2484           reasonably (at least, hashed), it does not result in a slowdown
2485           comparing with route cache reject entries.
2486           Note, that multicast routers are not affected, because
2487           route cache entry is created eventually.
2488         */
2489        if (ipv4_is_multicast(daddr)) {
2490                struct in_device *in_dev = __in_dev_get_rcu(dev);
2491
2492                if (in_dev) {
2493                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2494                                                  ip_hdr(skb)->protocol);
2495                        if (our
2496#ifdef CONFIG_IP_MROUTE
2497                                ||
2498                            (!ipv4_is_local_multicast(daddr) &&
2499                             IN_DEV_MFORWARD(in_dev))
2500#endif
2501                           ) {
2502                                int res = ip_route_input_mc(skb, daddr, saddr,
2503                                                            tos, dev, our);
2504                                rcu_read_unlock();
2505                                return res;
2506                        }
2507                }
2508                rcu_read_unlock();
2509                return -EINVAL;
2510        }
2511        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2512        rcu_read_unlock();
2513        return res;
2514}
2515EXPORT_SYMBOL(ip_route_input_common);
2516
2517/* called with rcu_read_lock() */
2518static struct rtable *__mkroute_output(const struct fib_result *res,
2519                                       const struct flowi4 *fl4,
2520                                       __be32 orig_daddr, __be32 orig_saddr,
2521                                       int orig_oif, __u8 orig_rtos,
2522                                       struct net_device *dev_out,
2523                                       unsigned int flags)
2524{
2525        struct fib_info *fi = res->fi;
2526        struct in_device *in_dev;
2527        u16 type = res->type;
2528        struct rtable *rth;
2529
2530        if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2531                return ERR_PTR(-EINVAL);
2532
2533        if (ipv4_is_lbcast(fl4->daddr))
2534                type = RTN_BROADCAST;
2535        else if (ipv4_is_multicast(fl4->daddr))
2536                type = RTN_MULTICAST;
2537        else if (ipv4_is_zeronet(fl4->daddr))
2538                return ERR_PTR(-EINVAL);
2539
2540        if (dev_out->flags & IFF_LOOPBACK)
2541                flags |= RTCF_LOCAL;
2542
2543        in_dev = __in_dev_get_rcu(dev_out);
2544        if (!in_dev)
2545                return ERR_PTR(-EINVAL);
2546
2547        if (type == RTN_BROADCAST) {
2548                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2549                fi = NULL;
2550        } else if (type == RTN_MULTICAST) {
2551                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2552                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2553                                     fl4->flowi4_proto))
2554                        flags &= ~RTCF_LOCAL;
2555                /* If multicast route do not exist use
2556                 * default one, but do not gateway in this case.
2557                 * Yes, it is hack.
2558                 */
2559                if (fi && res->prefixlen < 4)
2560                        fi = NULL;
2561        }
2562
2563        rth = rt_dst_alloc(dev_out,
2564                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2565                           IN_DEV_CONF_GET(in_dev, NOXFRM));
2566        if (!rth)
2567                return ERR_PTR(-ENOBUFS);
2568
2569        rth->dst.output = ip_output;
2570
2571        rth->rt_key_dst = orig_daddr;
2572        rth->rt_key_src = orig_saddr;
2573        rth->rt_genid = rt_genid(dev_net(dev_out));
2574        rth->rt_flags   = flags;
2575        rth->rt_type    = type;
2576        rth->rt_key_tos = orig_rtos;
2577        rth->rt_dst     = fl4->daddr;
2578        rth->rt_src     = fl4->saddr;
2579        rth->rt_route_iif = 0;
2580        rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2581        rth->rt_oif     = orig_oif;
2582        rth->rt_mark    = fl4->flowi4_mark;
2583        rth->rt_gateway = fl4->daddr;
2584        rth->rt_spec_dst= fl4->saddr;
2585        rth->rt_peer_genid = 0;
2586        rth->peer = NULL;
2587        rth->fi = NULL;
2588
2589        RT_CACHE_STAT_INC(out_slow_tot);
2590
2591        if (flags & RTCF_LOCAL) {
2592                rth->dst.input = ip_local_deliver;
2593                rth->rt_spec_dst = fl4->daddr;
2594        }
2595        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2596                rth->rt_spec_dst = fl4->saddr;
2597                if (flags & RTCF_LOCAL &&
2598                    !(dev_out->flags & IFF_LOOPBACK)) {
2599                        rth->dst.output = ip_mc_output;
2600                        RT_CACHE_STAT_INC(out_slow_mc);
2601                }
2602#ifdef CONFIG_IP_MROUTE
2603                if (type == RTN_MULTICAST) {
2604                        if (IN_DEV_MFORWARD(in_dev) &&
2605                            !ipv4_is_local_multicast(fl4->daddr)) {
2606                                rth->dst.input = ip_mr_input;
2607                                rth->dst.output = ip_mc_output;
2608                        }
2609                }
2610#endif
2611        }
2612
2613        rt_set_nexthop(rth, fl4, res, fi, type, 0);
2614
2615        return rth;
2616}
2617
2618/*
2619 * Major route resolver routine.
2620 * called with rcu_read_lock();
2621 */
2622
2623static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2624{
2625        struct net_device *dev_out = NULL;
2626        __u8 tos = RT_FL_TOS(fl4);
2627        unsigned int flags = 0;
2628        struct fib_result res;
2629        struct rtable *rth;
2630        __be32 orig_daddr;
2631        __be32 orig_saddr;
2632        int orig_oif;
2633
2634        res.fi          = NULL;
2635#ifdef CONFIG_IP_MULTIPLE_TABLES
2636        res.r           = NULL;
2637#endif
2638
2639        orig_daddr = fl4->daddr;
2640        orig_saddr = fl4->saddr;
2641        orig_oif = fl4->flowi4_oif;
2642
2643        fl4->flowi4_iif = net->loopback_dev->ifindex;
2644        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2645        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2646                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2647
2648        rcu_read_lock();
2649        if (fl4->saddr) {
2650                rth = ERR_PTR(-EINVAL);
2651                if (ipv4_is_multicast(fl4->saddr) ||
2652                    ipv4_is_lbcast(fl4->saddr) ||
2653                    ipv4_is_zeronet(fl4->saddr))
2654                        goto out;
2655
2656                /* I removed check for oif == dev_out->oif here.
2657                   It was wrong for two reasons:
2658                   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2659                      is assigned to multiple interfaces.
2660                   2. Moreover, we are allowed to send packets with saddr
2661                      of another iface. --ANK
2662                 */
2663
2664                if (fl4->flowi4_oif == 0 &&
2665                    (ipv4_is_multicast(fl4->daddr) ||
2666                     ipv4_is_lbcast(fl4->daddr))) {
2667                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2668                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2669                        if (dev_out == NULL)
2670                                goto out;
2671
2672                        /* Special hack: user can direct multicasts
2673                           and limited broadcast via necessary interface
2674                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2675                           This hack is not just for fun, it allows
2676                           vic,vat and friends to work.
2677                           They bind socket to loopback, set ttl to zero
2678                           and expect that it will work.
2679                           From the viewpoint of routing cache they are broken,
2680                           because we are not allowed to build multicast path
2681                           with loopback source addr (look, routing cache
2682                           cannot know, that ttl is zero, so that packet
2683                           will not leave this host and route is valid).
2684                           Luckily, this hack is good workaround.
2685                         */
2686
2687                        fl4->flowi4_oif = dev_out->ifindex;
2688                        goto make_route;
2689                }
2690
2691                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2692                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2693                        if (!__ip_dev_find(net, fl4->saddr, false))
2694                                goto out;
2695                }
2696        }
2697
2698
2699        if (fl4->flowi4_oif) {
2700                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2701                rth = ERR_PTR(-ENODEV);
2702                if (dev_out == NULL)
2703                        goto out;
2704
2705                /* RACE: Check return value of inet_select_addr instead. */
2706                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2707                        rth = ERR_PTR(-ENETUNREACH);
2708                        goto out;
2709                }
2710                if (ipv4_is_local_multicast(fl4->daddr) ||
2711                    ipv4_is_lbcast(fl4->daddr)) {
2712                        if (!fl4->saddr)
2713                                fl4->saddr = inet_select_addr(dev_out, 0,
2714                                                              RT_SCOPE_LINK);
2715                        goto make_route;
2716                }
2717                if (fl4->saddr) {
2718                        if (ipv4_is_multicast(fl4->daddr))
2719                                fl4->saddr = inet_select_addr(dev_out, 0,
2720                                                              fl4->flowi4_scope);
2721                        else if (!fl4->daddr)
2722                                fl4->saddr = inet_select_addr(dev_out, 0,
2723                                                              RT_SCOPE_HOST);
2724                }
2725        }
2726
2727        if (!fl4->daddr) {
2728                fl4->daddr = fl4->saddr;
2729                if (!fl4->daddr)
2730                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2731                dev_out = net->loopback_dev;
2732                fl4->flowi4_oif = net->loopback_dev->ifindex;
2733                res.type = RTN_LOCAL;
2734                flags |= RTCF_LOCAL;
2735                goto make_route;
2736        }
2737
2738        if (fib_lookup(net, fl4, &res)) {
2739                res.fi = NULL;
2740                if (fl4->flowi4_oif) {
2741                        /* Apparently, routing tables are wrong. Assume,
2742                           that the destination is on link.
2743
2744                           WHY? DW.
2745                           Because we are allowed to send to iface
2746                           even if it has NO routes and NO assigned
2747                           addresses. When oif is specified, routing
2748                           tables are looked up with only one purpose:
2749                           to catch if destination is gatewayed, rather than
2750                           direct. Moreover, if MSG_DONTROUTE is set,
2751                           we send packet, ignoring both routing tables
2752                           and ifaddr state. --ANK
2753
2754
2755                           We could make it even if oif is unknown,
2756                           likely IPv6, but we do not.
2757                         */
2758
2759                        if (fl4->saddr == 0)
2760                                fl4->saddr = inet_select_addr(dev_out, 0,
2761                                                              RT_SCOPE_LINK);
2762                        res.type = RTN_UNICAST;
2763                        goto make_route;
2764                }
2765                rth = ERR_PTR(-ENETUNREACH);
2766                goto out;
2767        }
2768
2769        if (res.type == RTN_LOCAL) {
2770                if (!fl4->saddr) {
2771                        if (res.fi->fib_prefsrc)
2772                                fl4->saddr = res.fi->fib_prefsrc;
2773                        else
2774                                fl4->saddr = fl4->daddr;
2775                }
2776                dev_out = net->loopback_dev;
2777                fl4->flowi4_oif = dev_out->ifindex;
2778                res.fi = NULL;
2779                flags |= RTCF_LOCAL;
2780                goto make_route;
2781        }
2782
2783#ifdef CONFIG_IP_ROUTE_MULTIPATH
2784        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2785                fib_select_multipath(&res);
2786        else
2787#endif
2788        if (!res.prefixlen &&
2789            res.table->tb_num_default > 1 &&
2790            res.type == RTN_UNICAST && !fl4->flowi4_oif)
2791                fib_select_default(&res);
2792
2793        if (!fl4->saddr)
2794                fl4->saddr = FIB_RES_PREFSRC(net, res);
2795
2796        dev_out = FIB_RES_DEV(res);
2797        fl4->flowi4_oif = dev_out->ifindex;
2798
2799
2800make_route:
2801        rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2802                               tos, dev_out, flags);
2803        if (!IS_ERR(rth)) {
2804                unsigned int hash;
2805
2806                hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2807                               rt_genid(dev_net(dev_out)));
2808                rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2809        }
2810
2811out:
2812        rcu_read_unlock();
2813        return rth;
2814}
2815
2816struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2817{
2818        struct rtable *rth;
2819        unsigned int hash;
2820
2821        if (!rt_caching(net))
2822                goto slow_output;
2823
2824        hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2825
2826        rcu_read_lock_bh();
2827        for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2828                rth = rcu_dereference_bh(rth->dst.rt_next)) {
2829                if (rth->rt_key_dst == flp4->daddr &&
2830                    rth->rt_key_src == flp4->saddr &&
2831                    rt_is_output_route(rth) &&
2832                    rth->rt_oif == flp4->flowi4_oif &&
2833                    rth->rt_mark == flp4->flowi4_mark &&
2834                    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2835                            (IPTOS_RT_MASK | RTO_ONLINK)) &&
2836                    net_eq(dev_net(rth->dst.dev), net) &&
2837                    !rt_is_expired(rth)) {
2838                        ipv4_validate_peer(rth);
2839                        dst_use(&rth->dst, jiffies);
2840                        RT_CACHE_STAT_INC(out_hit);
2841                        rcu_read_unlock_bh();
2842                        if (!flp4->saddr)
2843                                flp4->saddr = rth->rt_src;
2844                        if (!flp4->daddr)
2845                                flp4->daddr = rth->rt_dst;
2846                        return rth;
2847                }
2848                RT_CACHE_STAT_INC(out_hlist_search);
2849        }
2850        rcu_read_unlock_bh();
2851
2852slow_output:
2853        return ip_route_output_slow(net, flp4);
2854}
2855EXPORT_SYMBOL_GPL(__ip_route_output_key);
2856
2857static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2858{
2859        return NULL;
2860}
2861
2862static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2863{
2864        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2865
2866        return mtu ? : dst->dev->mtu;
2867}
2868
2869static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2870{
2871}
2872
2873static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2874                                          unsigned long old)
2875{
2876        return NULL;
2877}
2878
/*
 * dst_ops for IPv4 blackhole routes; handed to dst_alloc() by
 * ipv4_blackhole_route().  The check, mtu, update_pmtu and cow_metrics
 * hooks are the blackhole-specific stubs defined above; destroy,
 * default_advmss and neigh_lookup reuse the regular IPv4 handlers.
 */
2879static struct dst_ops ipv4_dst_blackhole_ops = {
2880        .family                 =       AF_INET,
2881        .protocol               =       cpu_to_be16(ETH_P_IP),
2882        .destroy                =       ipv4_dst_destroy,
2883        .check                  =       ipv4_blackhole_dst_check,
2884        .mtu                    =       ipv4_blackhole_mtu,
2885        .default_advmss         =       ipv4_default_advmss,
2886        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2887        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2888        .neigh_lookup           =       ipv4_neigh_lookup,
2889};
2890
2891struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2892{
2893        struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2894        struct rtable *ort = (struct rtable *) dst_orig;
2895
2896        if (rt) {
2897                struct dst_entry *new = &rt->dst;
2898
2899                new->__use = 1;
2900                new->input = dst_discard;
2901                new->output = dst_discard;
2902                dst_copy_metrics(new, &ort->dst);
2903
2904                new->dev = ort->dst.dev;
2905                if (new->dev)
2906                        dev_hold(new->dev);
2907
2908                rt->rt_key_dst = ort->rt_key_dst;
2909                rt->rt_key_src = ort->rt_key_src;
2910                rt->rt_key_tos = ort->rt_key_tos;
2911                rt->rt_route_iif = ort->rt_route_iif;
2912                rt->rt_iif = ort->rt_iif;
2913                rt->rt_oif = ort->rt_oif;
2914                rt->rt_mark = ort->rt_mark;
2915
2916                rt->rt_genid = rt_genid(net);
2917                rt->rt_flags = ort->rt_flags;
2918                rt->rt_type = ort->rt_type;
2919                rt->rt_dst = ort->rt_dst;
2920                rt->rt_src = ort->rt_src;
2921                rt->rt_gateway = ort->rt_gateway;
2922                rt->rt_spec_dst = ort->rt_spec_dst;
2923                rt->peer = ort->peer;
2924                if (rt->peer)
2925                        atomic_inc(&rt->peer->refcnt);
2926                rt->fi = ort->fi;
2927                if (rt->fi)
2928                        atomic_inc(&rt->fi->fib_clntref);
2929
2930                dst_free(new);
2931        }
2932
2933        dst_release(dst_orig);
2934
2935        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2936}
2937
2938struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2939                                    struct sock *sk)
2940{
2941        struct rtable *rt = __ip_route_output_key(net, flp4);
2942
2943        if (IS_ERR(rt))
2944                return rt;
2945
2946        if (flp4->flowi4_proto)
2947                rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2948                                                   flowi4_to_flowi(flp4),
2949                                                   sk, 0);
2950
2951        return rt;
2952}
2953EXPORT_SYMBOL_GPL(ip_route_output_flow);
2954
2955static int rt_fill_info(struct net *net,
2956                        struct sk_buff *skb, u32 pid, u32 seq, int event,
2957                        int nowait, unsigned int flags)
2958{
2959        struct rtable *rt = skb_rtable(skb);
2960        struct rtmsg *r;
2961        struct nlmsghdr *nlh;
2962        unsigned long expires = 0;
2963        const struct inet_peer *peer = rt->peer;
2964        u32 id = 0, ts = 0, tsage = 0, error;
2965
2966        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2967        if (nlh == NULL)
2968                return -EMSGSIZE;
2969
2970        r = nlmsg_data(nlh);
2971        r->rtm_family    = AF_INET;
2972        r->rtm_dst_len  = 32;
2973        r->rtm_src_len  = 0;
2974        r->rtm_tos      = rt->rt_key_tos;
2975        r->rtm_table    = RT_TABLE_MAIN;
2976        NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2977        r->rtm_type     = rt->rt_type;
2978        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2979        r->rtm_protocol = RTPROT_UNSPEC;
2980        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2981        if (rt->rt_flags & RTCF_NOTIFY)
2982                r->rtm_flags |= RTM_F_NOTIFY;
2983
2984        NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2985
2986        if (rt->rt_key_src) {
2987                r->rtm_src_len = 32;
2988                NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2989        }
2990        if (rt->dst.dev)
2991                NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2992#ifdef CONFIG_IP_ROUTE_CLASSID
2993        if (rt->dst.tclassid)
2994                NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2995#endif
2996        if (rt_is_input_route(rt))
2997                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2998        else if (rt->rt_src != rt->rt_key_src)
2999                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3000
3001        if (rt->rt_dst != rt->rt_gateway)
3002                NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3003
3004        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3005                goto nla_put_failure;
3006
3007        if (rt->rt_mark)
3008                NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3009
3010        error = rt->dst.error;
3011        if (peer) {
3012                inet_peer_refcheck(rt->peer);
3013                id = atomic_read(&peer->ip_id_count) & 0xffff;
3014                if (peer->tcp_ts_stamp) {
3015                        ts = peer->tcp_ts;
3016                        tsage = get_seconds() - peer->tcp_ts_stamp;
3017                }
3018                expires = ACCESS_ONCE(peer->pmtu_expires);
3019                if (expires) {
3020                        if (time_before(jiffies, expires))
3021                                expires -= jiffies;
3022                        else
3023                                expires = 0;
3024                }
3025        }
3026
3027        if (rt_is_input_route(rt)) {
3028#ifdef CONFIG_IP_MROUTE
3029                __be32 dst = rt->rt_dst;
3030
3031                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3032                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3033                        int err = ipmr_get_route(net, skb,
3034                                                 rt->rt_src, rt->rt_dst,
3035                                                 r, nowait);
3036                        if (err <= 0) {
3037                                if (!nowait) {
3038                                        if (err == 0)
3039                                                return 0;
3040                                        goto nla_put_failure;
3041                                } else {
3042                                        if (err == -EMSGSIZE)
3043                                                goto nla_put_failure;
3044                                        error = err;
3045                                }
3046                        }
3047                } else
3048#endif
3049                        NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3050        }
3051
3052        if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3053                               expires, error) < 0)
3054                goto nla_put_failure;
3055
3056        return nlmsg_end(skb, nlh);
3057
3058nla_put_failure:
3059        nlmsg_cancel(skb, nlh);
3060        return -EMSGSIZE;
3061}
3062
3063static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3064{
3065        struct net *net = sock_net(in_skb->sk);
3066        struct rtmsg *rtm;
3067        struct nlattr *tb[RTA_MAX+1];
3068        struct rtable *rt = NULL;
3069        __be32 dst = 0;
3070        __be32 src = 0;
3071        u32 iif;
3072        int err;
3073        int mark;
3074        struct sk_buff *skb;
3075
3076        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3077        if (err < 0)
3078                goto errout;
3079
3080        rtm = nlmsg_data(nlh);
3081
3082        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3083        if (skb == NULL) {
3084                err = -ENOBUFS;
3085                goto errout;
3086        }
3087
3088        /* Reserve room for dummy headers, this skb can pass
3089           through good chunk of routing engine.
3090         */
3091        skb_reset_mac_header(skb);
3092        skb_reset_network_header(skb);
3093
3094        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3095        ip_hdr(skb)->protocol = IPPROTO_ICMP;
3096        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3097
3098        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3099        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3100        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3101        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3102
3103        if (iif) {
3104                struct net_device *dev;
3105
3106                dev = __dev_get_by_index(net, iif);
3107                if (dev == NULL) {
3108                        err = -ENODEV;
3109                        goto errout_free;
3110                }
3111
3112                skb->protocol   = htons(ETH_P_IP);
3113                skb->dev        = dev;
3114                skb->mark       = mark;
3115                local_bh_disable();
3116                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3117                local_bh_enable();
3118
3119                rt = skb_rtable(skb);
3120                if (err == 0 && rt->dst.error)
3121                        err = -rt->dst.error;
3122        } else {
3123                struct flowi4 fl4 = {
3124                        .daddr = dst,
3125                        .saddr = src,
3126                        .flowi4_tos = rtm->rtm_tos,
3127                        .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3128                        .flowi4_mark = mark,
3129                };
3130                rt = ip_route_output_key(net, &fl4);
3131
3132                err = 0;
3133                if (IS_ERR(rt))
3134                        err = PTR_ERR(rt);
3135        }
3136
3137        if (err)
3138                goto errout_free;
3139
3140        skb_dst_set(skb, &rt->dst);
3141        if (rtm->rtm_flags & RTM_F_NOTIFY)
3142                rt->rt_flags |= RTCF_NOTIFY;
3143
3144        err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3145                           RTM_NEWROUTE, 0, 0);
3146        if (err <= 0)
3147                goto errout_free;
3148
3149        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3150errout:
3151        return err;
3152
3153errout_free:
3154        kfree_skb(skb);
3155        goto errout;
3156}
3157
3158int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3159{
3160        struct rtable *rt;
3161        int h, s_h;
3162        int idx, s_idx;
3163        struct net *net;
3164
3165        net = sock_net(skb->sk);
3166
3167        s_h = cb->args[0];
3168        if (s_h < 0)
3169                s_h = 0;
3170        s_idx = idx = cb->args[1];
3171        for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3172                if (!rt_hash_table[h].chain)
3173                        continue;
3174                rcu_read_lock_bh();
3175                for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3176                     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3177                        if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3178                                continue;
3179                        if (rt_is_expired(rt))
3180                                continue;
3181                        skb_dst_set_noref(skb, &rt->dst);
3182                        if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3183                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3184                                         1, NLM_F_MULTI) <= 0) {
3185                                skb_dst_drop(skb);
3186                                rcu_read_unlock_bh();
3187                                goto done;
3188                        }
3189                        skb_dst_drop(skb);
3190                }
3191                rcu_read_unlock_bh();
3192        }
3193
3194done:
3195        cb->args[0] = h;
3196        cb->args[1] = idx;
3197        return skb->len;
3198}
3199
3200void ip_rt_multicast_event(struct in_device *in_dev)
3201{
3202        rt_cache_flush(dev_net(in_dev->dev), 0);
3203}
3204
3205#ifdef CONFIG_SYSCTL
3206static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3207                                        void __user *buffer,
3208                                        size_t *lenp, loff_t *ppos)
3209{
3210        if (write) {
3211                int flush_delay;
3212                ctl_table ctl;
3213                struct net *net;
3214
3215                memcpy(&ctl, __ctl, sizeof(ctl));
3216                ctl.data = &flush_delay;
3217                proc_dointvec(&ctl, write, buffer, lenp, ppos);
3218
3219                net = (struct net *)__ctl->extra1;
3220                rt_cache_flush(net, flush_delay);
3221                return 0;
3222        }
3223
3224        return -EINVAL;
3225}
3226
/*
 * Route tunables exposed through sysctl; this table is the child of the
 * "route" entry in ipv4_skeleton.  Every entry is an int with mode 0644.
 * Entries using proc_dointvec_jiffies / proc_dointvec_ms_jiffies go
 * through those conversion handlers; the rest are plain integers.
 */
3227static ctl_table ipv4_route_table[] = {
3228        {
3229                .procname       = "gc_thresh",
3230                .data           = &ipv4_dst_ops.gc_thresh,
3231                .maxlen         = sizeof(int),
3232                .mode           = 0644,
3233                .proc_handler   = proc_dointvec,
3234        },
3235        {
3236                .procname       = "max_size",
3237                .data           = &ip_rt_max_size,
3238                .maxlen         = sizeof(int),
3239                .mode           = 0644,
3240                .proc_handler   = proc_dointvec,
3241        },
3242        {
3243                /*  Deprecated. Use gc_min_interval_ms */
3244
3245                .procname       = "gc_min_interval",
3246                .data           = &ip_rt_gc_min_interval,
3247                .maxlen         = sizeof(int),
3248                .mode           = 0644,
3249                .proc_handler   = proc_dointvec_jiffies,
3250        },
3251        {
                /* Same variable as gc_min_interval, via the ms handler. */
3252                .procname       = "gc_min_interval_ms",
3253                .data           = &ip_rt_gc_min_interval,
3254                .maxlen         = sizeof(int),
3255                .mode           = 0644,
3256                .proc_handler   = proc_dointvec_ms_jiffies,
3257        },
3258        {
3259                .procname       = "gc_timeout",
3260                .data           = &ip_rt_gc_timeout,
3261                .maxlen         = sizeof(int),
3262                .mode           = 0644,
3263                .proc_handler   = proc_dointvec_jiffies,
3264        },
3265        {
3266                .procname       = "gc_interval",
3267                .data           = &ip_rt_gc_interval,
3268                .maxlen         = sizeof(int),
3269                .mode           = 0644,
3270                .proc_handler   = proc_dointvec_jiffies,
3271        },
3272        {
3273                .procname       = "redirect_load",
3274                .data           = &ip_rt_redirect_load,
3275                .maxlen         = sizeof(int),
3276                .mode           = 0644,
3277                .proc_handler   = proc_dointvec,
3278        },
3279        {
3280                .procname       = "redirect_number",
3281                .data           = &ip_rt_redirect_number,
3282                .maxlen         = sizeof(int),
3283                .mode           = 0644,
3284                .proc_handler   = proc_dointvec,
3285        },
3286        {
3287                .procname       = "redirect_silence",
3288                .data           = &ip_rt_redirect_silence,
3289                .maxlen         = sizeof(int),
3290                .mode           = 0644,
3291                .proc_handler   = proc_dointvec,
3292        },
3293        {
3294                .procname       = "error_cost",
3295                .data           = &ip_rt_error_cost,
3296                .maxlen         = sizeof(int),
3297                .mode           = 0644,
3298                .proc_handler   = proc_dointvec,
3299        },
3300        {
3301                .procname       = "error_burst",
3302                .data           = &ip_rt_error_burst,
3303                .maxlen         = sizeof(int),
3304                .mode           = 0644,
3305                .proc_handler   = proc_dointvec,
3306        },
3307        {
3308                .procname       = "gc_elasticity",
3309                .data           = &ip_rt_gc_elasticity,
3310                .maxlen         = sizeof(int),
3311                .mode           = 0644,
3312                .proc_handler   = proc_dointvec,
3313        },
3314        {
3315                .procname       = "mtu_expires",
3316                .data           = &ip_rt_mtu_expires,
3317                .maxlen         = sizeof(int),
3318                .mode           = 0644,
3319                .proc_handler   = proc_dointvec_jiffies,
3320        },
3321        {
3322                .procname       = "min_pmtu",
3323                .data           = &ip_rt_min_pmtu,
3324                .maxlen         = sizeof(int),
3325                .mode           = 0644,
3326                .proc_handler   = proc_dointvec,
3327        },
3328        {
3329                .procname       = "min_adv_mss",
3330                .data           = &ip_rt_min_advmss,
3331                .maxlen         = sizeof(int),
3332                .mode           = 0644,
3333                .proc_handler   = proc_dointvec,
3334        },
        /* Empty terminating entry. */
3335        { }
3336};
3337
3338static struct ctl_table empty[1];
3339
3340static struct ctl_table ipv4_skeleton[] =
3341{
3342        { .procname = "route", 
3343          .mode = 0555, .child = ipv4_route_table},
3344        { .procname = "neigh", 
3345          .mode = 0555, .child = empty},
3346        { }
3347};
3348
3349static __net_initdata struct ctl_path ipv4_path[] = {
3350        { .procname = "net", },
3351        { .procname = "ipv4", },
3352        { },
3353};
3354
3355static struct ctl_table ipv4_route_flush_table[] = {
3356        {
3357                .procname       = "flush",
3358                .maxlen         = sizeof(int),
3359                .mode           = 0200,
3360                .proc_handler   = ipv4_sysctl_rtcache_flush,
3361        },
3362        { },
3363};
3364
3365static __net_initdata struct ctl_path ipv4_route_path[] = {
3366        { .procname = "net", },
3367        { .procname = "ipv4", },
3368        { .procname = "route", },
3369        { },
3370};
3371
3372static __net_init int sysctl_route_net_init(struct net *net)
3373{
3374        struct ctl_table *tbl;
3375
3376        tbl = ipv4_route_flush_table;
3377        if (!net_eq(net, &init_net)) {
3378                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3379                if (tbl == NULL)
3380                        goto err_dup;
3381        }
3382        tbl[0].extra1 = net;
3383
3384        net->ipv4.route_hdr =
3385                register_net_sysctl_table(net, ipv4_route_path, tbl);
3386        if (net->ipv4.route_hdr == NULL)
3387                goto err_reg;
3388        return 0;
3389
3390err_reg:
3391        if (tbl != ipv4_route_flush_table)
3392                kfree(tbl);
3393err_dup:
3394        return -ENOMEM;
3395}
3396
3397static __net_exit void sysctl_route_net_exit(struct net *net)
3398{
3399        struct ctl_table *tbl;
3400
3401        tbl = net->ipv4.route_hdr->ctl_table_arg;
3402        unregister_net_sysctl_table(net->ipv4.route_hdr);
3403        BUG_ON(tbl == ipv4_route_flush_table);
3404        kfree(tbl);
3405}
3406
3407static __net_initdata struct pernet_operations sysctl_route_ops = {
3408        .init = sysctl_route_net_init,
3409        .exit = sysctl_route_net_exit,
3410};
3411#endif
3412
3413static __net_init int rt_genid_init(struct net *net)
3414{
3415        get_random_bytes(&net->ipv4.rt_genid,
3416                         sizeof(net->ipv4.rt_genid));
3417        get_random_bytes(&net->ipv4.dev_addr_genid,
3418                         sizeof(net->ipv4.dev_addr_genid));
3419        return 0;
3420}
3421
3422static __net_initdata struct pernet_operations rt_genid_ops = {
3423        .init = rt_genid_init,
3424};
3425
3426
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area, allocated in ip_rt_init() with room for 256
 * struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3430
3431static __initdata unsigned long rhash_entries;
3432static int __init set_rhash_entries(char *str)
3433{
3434        if (!str)
3435                return 0;
3436        rhash_entries = simple_strtoul(str, &str, 0);
3437        return 1;
3438}
3439__setup("rhash_entries=", set_rhash_entries);
3440
3441int __init ip_rt_init(void)
3442{
3443        int rc = 0;
3444
3445#ifdef CONFIG_IP_ROUTE_CLASSID
3446        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3447        if (!ip_rt_acct)
3448                panic("IP: failed to allocate ip_rt_acct\n");
3449#endif
3450
3451        ipv4_dst_ops.kmem_cachep =
3452                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3453                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3454
3455        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3456
3457        if (dst_entries_init(&ipv4_dst_ops) < 0)
3458                panic("IP: failed to allocate ipv4_dst_ops counter\n");
3459
3460        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3461                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3462
3463        rt_hash_table = (struct rt_hash_bucket *)
3464                alloc_large_system_hash("IP route cache",
3465                                        sizeof(struct rt_hash_bucket),
3466                                        rhash_entries,
3467                                        (totalram_pages >= 128 * 1024) ?
3468                                        15 : 17,
3469                                        0,
3470                                        &rt_hash_log,
3471                                        &rt_hash_mask,
3472                                        rhash_entries ? 0 : 512 * 1024);
3473        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3474        rt_hash_lock_init();
3475
3476        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3477        ip_rt_max_size = (rt_hash_mask + 1) * 16;
3478
3479        devinet_init();
3480        ip_fib_init();
3481
3482        INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3483        expires_ljiffies = jiffies;
3484        schedule_delayed_work(&expires_work,
3485                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3486
3487        if (ip_rt_proc_init())
3488                printk(KERN_ERR "Unable to create route proc files\n");
3489#ifdef CONFIG_XFRM
3490        xfrm_init();
3491        xfrm4_init(ip_rt_max_size);
3492#endif
3493        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3494
3495#ifdef CONFIG_SYSCTL
3496        register_pernet_subsys(&sysctl_route_ops);
3497#endif
3498        register_pernet_subsys(&rt_genid_ops);
3499        return rc;
3500}
3501
#ifdef CONFIG_SYSCTL
/*
 * Register the static net/ipv4 sysctl directory skeleton (ipv4_skeleton
 * under ipv4_path).
 *
 * This lives in its own function only because of the current IPv4
 * initialisation order; once that order is sanitized this workaround
 * can go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif /* CONFIG_SYSCTL */
3512
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.