linux/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       Verify area fixes.
  16 *              Alan Cox        :       cli() protects routing changes
  17 *              Rui Oliveira    :       ICMP routing table updates
  18 *              (rco@di.uminho.pt)      Routing table insertion and update
  19 *              Linus Torvalds  :       Rewrote bits to be sensible
  20 *              Alan Cox        :       Added BSD route gw semantics
  21 *              Alan Cox        :       Super /proc >4K
  22 *              Alan Cox        :       MTU in route table
  23 *              Alan Cox        :       MSS actually. Also added the window
  24 *                                      clamper.
  25 *              Sam Lantinga    :       Fixed route matching in rt_del()
  26 *              Alan Cox        :       Routing cache support.
  27 *              Alan Cox        :       Removed compatibility cruft.
  28 *              Alan Cox        :       RTF_REJECT support.
  29 *              Alan Cox        :       TCP irtt support.
  30 *              Jonathan Naylor :       Added Metric support.
  31 *      Miquel van Smoorenburg  :       BSD API fixes.
  32 *      Miquel van Smoorenburg  :       Metrics.
  33 *              Alan Cox        :       Use __u32 properly
  34 *              Alan Cox        :       Aligned routing errors more closely with BSD
  35 *                                      our system is still very different.
  36 *              Alan Cox        :       Faster /proc handling
  37 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38 *                                      routing caches and better behaviour.
  39 *
  40 *              Olaf Erb        :       irtt wasn't being copied right.
  41 *              Bjorn Ekwall    :       Kerneld route support.
  42 *              Alan Cox        :       Multicast fixed (I hope)
  43 *              Pavel Krauz     :       Limited broadcast fixed
  44 *              Mike McLagan    :       Routing by source
  45 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46 *                                      route.c and rewritten from scratch.
  47 *              Andi Kleen      :       Load-limit warning messages.
  48 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52 *              Marc Boucher    :       routing by fwmark
  53 *      Robert Olsson           :       Added rt_cache statistics
  54 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58 *
  59 *              This program is free software; you can redistribute it and/or
  60 *              modify it under the terms of the GNU General Public License
  61 *              as published by the Free Software Foundation; either version
  62 *              2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
   #include <linux/module.h>
   #include <asm/uaccess.h>
   #include <linux/bitops.h>
   #include <linux/types.h>
   #include <linux/kernel.h>
   #include <linux/mm.h>
   #include <linux/string.h>
   #include <linux/socket.h>
   #include <linux/sockios.h>
   #include <linux/errno.h>
   #include <linux/err.h>
   #include <linux/in.h>
   #include <linux/inet.h>
   #include <linux/netdevice.h>
   #include <linux/proc_fs.h>
   #include <linux/init.h>
   #include <linux/skbuff.h>
   #include <linux/inetdevice.h>
   #include <linux/igmp.h>
   #include <linux/pkt_sched.h>
   #include <linux/mroute.h>
   #include <linux/netfilter_ipv4.h>
   #include <linux/random.h>
   #include <linux/rcupdate.h>
   #include <linux/times.h>
   #include <linux/slab.h>
   #include <net/dst.h>
   #include <net/net_namespace.h>
   #include <net/protocol.h>
   #include <net/ip.h>
   #include <net/route.h>
   #include <net/inetpeer.h>
   #include <net/sock.h>
   #include <net/ip_fib.h>
   #include <net/arp.h>
   #include <net/tcp.h>
   #include <net/icmp.h>
   #include <net/xfrm.h>
   #include <net/netevent.h>
   #include <net/rtnetlink.h>
   #ifdef CONFIG_SYSCTL
   #include <linux/sysctl.h>
   #include <linux/kmemleak.h>
   #endif
   #include <net/secure_seq.h>
 111
 112#define RT_FL_TOS(oldflp4) \
 113        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 114
 115#define IP_MAX_MTU      0xFFF0
 116
 117#define RT_GC_TIMEOUT (300*HZ)
 118
 119static int ip_rt_max_size;
 120static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 121static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 122static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 123static int ip_rt_redirect_number __read_mostly  = 9;
 124static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126static int ip_rt_error_cost __read_mostly       = HZ;
 127static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128static int ip_rt_gc_elasticity __read_mostly    = 8;
 129static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131static int ip_rt_min_advmss __read_mostly       = 256;
 132
 133/*
 134 *      Interface to generic destination cache.
 135 */
 136
 137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 138static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 139static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 140static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 141static void              ipv4_link_failure(struct sk_buff *skb);
 142static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 143                                           struct sk_buff *skb, u32 mtu);
 144static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145                                        struct sk_buff *skb);
 146static void             ipv4_dst_destroy(struct dst_entry *dst);
 147
 148static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 149                            int how)
 150{
 151}
 152
 153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154{
 155        WARN_ON(1);
 156        return NULL;
 157}
 158
 159static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 160                                           struct sk_buff *skb,
 161                                           const void *daddr);
 162
 163static struct dst_ops ipv4_dst_ops = {
 164        .family =               AF_INET,
 165        .protocol =             cpu_to_be16(ETH_P_IP),
 166        .check =                ipv4_dst_check,
 167        .default_advmss =       ipv4_default_advmss,
 168        .mtu =                  ipv4_mtu,
 169        .cow_metrics =          ipv4_cow_metrics,
 170        .destroy =              ipv4_dst_destroy,
 171        .ifdown =               ipv4_dst_ifdown,
 172        .negative_advice =      ipv4_negative_advice,
 173        .link_failure =         ipv4_link_failure,
 174        .update_pmtu =          ip_rt_update_pmtu,
 175        .redirect =             ip_do_redirect,
 176        .local_out =            __ip_local_out,
 177        .neigh_lookup =         ipv4_neigh_lookup,
 178};
 179
 180#define ECN_OR_COST(class)      TC_PRIO_##class
 181
 182const __u8 ip_tos2prio[16] = {
 183        TC_PRIO_BESTEFFORT,
 184        ECN_OR_COST(BESTEFFORT),
 185        TC_PRIO_BESTEFFORT,
 186        ECN_OR_COST(BESTEFFORT),
 187        TC_PRIO_BULK,
 188        ECN_OR_COST(BULK),
 189        TC_PRIO_BULK,
 190        ECN_OR_COST(BULK),
 191        TC_PRIO_INTERACTIVE,
 192        ECN_OR_COST(INTERACTIVE),
 193        TC_PRIO_INTERACTIVE,
 194        ECN_OR_COST(INTERACTIVE),
 195        TC_PRIO_INTERACTIVE_BULK,
 196        ECN_OR_COST(INTERACTIVE_BULK),
 197        TC_PRIO_INTERACTIVE_BULK,
 198        ECN_OR_COST(INTERACTIVE_BULK)
 199};
 200EXPORT_SYMBOL(ip_tos2prio);
 201
 202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 203#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 204
 205#ifdef CONFIG_PROC_FS
 206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 207{
 208        if (*pos)
 209                return NULL;
 210        return SEQ_START_TOKEN;
 211}
 212
 213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 214{
 215        ++*pos;
 216        return NULL;
 217}
 218
 219static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 220{
 221}
 222
 223static int rt_cache_seq_show(struct seq_file *seq, void *v)
 224{
 225        if (v == SEQ_START_TOKEN)
 226                seq_printf(seq, "%-127s\n",
 227                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 228                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 229                           "HHUptod\tSpecDst");
 230        return 0;
 231}
 232
 233static const struct seq_operations rt_cache_seq_ops = {
 234        .start  = rt_cache_seq_start,
 235        .next   = rt_cache_seq_next,
 236        .stop   = rt_cache_seq_stop,
 237        .show   = rt_cache_seq_show,
 238};
 239
 240static int rt_cache_seq_open(struct inode *inode, struct file *file)
 241{
 242        return seq_open(file, &rt_cache_seq_ops);
 243}
 244
 245static const struct file_operations rt_cache_seq_fops = {
 246        .owner   = THIS_MODULE,
 247        .open    = rt_cache_seq_open,
 248        .read    = seq_read,
 249        .llseek  = seq_lseek,
 250        .release = seq_release,
 251};
 252
 253
 254static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 255{
 256        int cpu;
 257
 258        if (*pos == 0)
 259                return SEQ_START_TOKEN;
 260
 261        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 262                if (!cpu_possible(cpu))
 263                        continue;
 264                *pos = cpu+1;
 265                return &per_cpu(rt_cache_stat, cpu);
 266        }
 267        return NULL;
 268}
 269
 270static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 271{
 272        int cpu;
 273
 274        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 275                if (!cpu_possible(cpu))
 276                        continue;
 277                *pos = cpu+1;
 278                return &per_cpu(rt_cache_stat, cpu);
 279        }
 280        return NULL;
 281
 282}
 283
 284static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 285{
 286
 287}
 288
 289static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 290{
 291        struct rt_cache_stat *st = v;
 292
 293        if (v == SEQ_START_TOKEN) {
 294                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 295                return 0;
 296        }
 297
 298        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 299                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 300                   dst_entries_get_slow(&ipv4_dst_ops),
 301                   st->in_hit,
 302                   st->in_slow_tot,
 303                   st->in_slow_mc,
 304                   st->in_no_route,
 305                   st->in_brd,
 306                   st->in_martian_dst,
 307                   st->in_martian_src,
 308
 309                   st->out_hit,
 310                   st->out_slow_tot,
 311                   st->out_slow_mc,
 312
 313                   st->gc_total,
 314                   st->gc_ignored,
 315                   st->gc_goal_miss,
 316                   st->gc_dst_overflow,
 317                   st->in_hlist_search,
 318                   st->out_hlist_search
 319                );
 320        return 0;
 321}
 322
 323static const struct seq_operations rt_cpu_seq_ops = {
 324        .start  = rt_cpu_seq_start,
 325        .next   = rt_cpu_seq_next,
 326        .stop   = rt_cpu_seq_stop,
 327        .show   = rt_cpu_seq_show,
 328};
 329
 330
 331static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 332{
 333        return seq_open(file, &rt_cpu_seq_ops);
 334}
 335
 336static const struct file_operations rt_cpu_seq_fops = {
 337        .owner   = THIS_MODULE,
 338        .open    = rt_cpu_seq_open,
 339        .read    = seq_read,
 340        .llseek  = seq_lseek,
 341        .release = seq_release,
 342};
 343
 344#ifdef CONFIG_IP_ROUTE_CLASSID
 345static int rt_acct_proc_show(struct seq_file *m, void *v)
 346{
 347        struct ip_rt_acct *dst, *src;
 348        unsigned int i, j;
 349
 350        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 351        if (!dst)
 352                return -ENOMEM;
 353
 354        for_each_possible_cpu(i) {
 355                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 356                for (j = 0; j < 256; j++) {
 357                        dst[j].o_bytes   += src[j].o_bytes;
 358                        dst[j].o_packets += src[j].o_packets;
 359                        dst[j].i_bytes   += src[j].i_bytes;
 360                        dst[j].i_packets += src[j].i_packets;
 361                }
 362        }
 363
 364        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 365        kfree(dst);
 366        return 0;
 367}
 368
 369static int rt_acct_proc_open(struct inode *inode, struct file *file)
 370{
 371        return single_open(file, rt_acct_proc_show, NULL);
 372}
 373
 374static const struct file_operations rt_acct_proc_fops = {
 375        .owner          = THIS_MODULE,
 376        .open           = rt_acct_proc_open,
 377        .read           = seq_read,
 378        .llseek         = seq_lseek,
 379        .release        = single_release,
 380};
 381#endif
 382
 383static int __net_init ip_rt_do_proc_init(struct net *net)
 384{
 385        struct proc_dir_entry *pde;
 386
 387        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 388                        &rt_cache_seq_fops);
 389        if (!pde)
 390                goto err1;
 391
 392        pde = proc_create("rt_cache", S_IRUGO,
 393                          net->proc_net_stat, &rt_cpu_seq_fops);
 394        if (!pde)
 395                goto err2;
 396
 397#ifdef CONFIG_IP_ROUTE_CLASSID
 398        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 399        if (!pde)
 400                goto err3;
 401#endif
 402        return 0;
 403
 404#ifdef CONFIG_IP_ROUTE_CLASSID
 405err3:
 406        remove_proc_entry("rt_cache", net->proc_net_stat);
 407#endif
 408err2:
 409        remove_proc_entry("rt_cache", net->proc_net);
 410err1:
 411        return -ENOMEM;
 412}
 413
 414static void __net_exit ip_rt_do_proc_exit(struct net *net)
 415{
 416        remove_proc_entry("rt_cache", net->proc_net_stat);
 417        remove_proc_entry("rt_cache", net->proc_net);
 418#ifdef CONFIG_IP_ROUTE_CLASSID
 419        remove_proc_entry("rt_acct", net->proc_net);
 420#endif
 421}
 422
 423static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 424        .init = ip_rt_do_proc_init,
 425        .exit = ip_rt_do_proc_exit,
 426};
 427
 428static int __init ip_rt_proc_init(void)
 429{
 430        return register_pernet_subsys(&ip_rt_proc_ops);
 431}
 432
 433#else
 434static inline int ip_rt_proc_init(void)
 435{
 436        return 0;
 437}
 438#endif /* CONFIG_PROC_FS */
 439
 440static inline bool rt_is_expired(const struct rtable *rth)
 441{
 442        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 443}
 444
 445void rt_cache_flush(struct net *net)
 446{
 447        rt_genid_bump(net);
 448}
 449
 450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 451                                           struct sk_buff *skb,
 452                                           const void *daddr)
 453{
 454        struct net_device *dev = dst->dev;
 455        const __be32 *pkey = daddr;
 456        const struct rtable *rt;
 457        struct neighbour *n;
 458
 459        rt = (const struct rtable *) dst;
 460        if (rt->rt_gateway)
 461                pkey = (const __be32 *) &rt->rt_gateway;
 462        else if (skb)
 463                pkey = &ip_hdr(skb)->daddr;
 464
 465        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 466        if (n)
 467                return n;
 468        return neigh_create(&arp_tbl, pkey, dev);
 469}
 470
 471/*
 472 * Peer allocation may fail only in serious out-of-memory conditions.  However
 473 * we still can generate some output.
 474 * Random ID selection looks a bit dangerous because we have no chances to
 475 * select ID being unique in a reasonable period of time.
 476 * But broken packet identifier may be better than no packet at all.
 477 */
 478static void ip_select_fb_ident(struct iphdr *iph)
 479{
 480        static DEFINE_SPINLOCK(ip_fb_id_lock);
 481        static u32 ip_fallback_id;
 482        u32 salt;
 483
 484        spin_lock_bh(&ip_fb_id_lock);
 485        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
 486        iph->id = htons(salt & 0xFFFF);
 487        ip_fallback_id = salt;
 488        spin_unlock_bh(&ip_fb_id_lock);
 489}
 490
 491void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 492{
 493        struct net *net = dev_net(dst->dev);
 494        struct inet_peer *peer;
 495
 496        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
 497        if (peer) {
 498                iph->id = htons(inet_getid(peer, more));
 499                inet_putpeer(peer);
 500                return;
 501        }
 502
 503        ip_select_fb_ident(iph);
 504}
 505EXPORT_SYMBOL(__ip_select_ident);
 506
 507static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 508                             const struct iphdr *iph,
 509                             int oif, u8 tos,
 510                             u8 prot, u32 mark, int flow_flags)
 511{
 512        if (sk) {
 513                const struct inet_sock *inet = inet_sk(sk);
 514
 515                oif = sk->sk_bound_dev_if;
 516                mark = sk->sk_mark;
 517                tos = RT_CONN_FLAGS(sk);
 518                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 519        }
 520        flowi4_init_output(fl4, oif, mark, tos,
 521                           RT_SCOPE_UNIVERSE, prot,
 522                           flow_flags,
 523                           iph->daddr, iph->saddr, 0, 0);
 524}
 525
 526static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 527                               const struct sock *sk)
 528{
 529        const struct iphdr *iph = ip_hdr(skb);
 530        int oif = skb->dev->ifindex;
 531        u8 tos = RT_TOS(iph->tos);
 532        u8 prot = iph->protocol;
 533        u32 mark = skb->mark;
 534
 535        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 536}
 537
 538static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 539{
 540        const struct inet_sock *inet = inet_sk(sk);
 541        const struct ip_options_rcu *inet_opt;
 542        __be32 daddr = inet->inet_daddr;
 543
 544        rcu_read_lock();
 545        inet_opt = rcu_dereference(inet->inet_opt);
 546        if (inet_opt && inet_opt->opt.srr)
 547                daddr = inet_opt->opt.faddr;
 548        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 549                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 550                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 551                           inet_sk_flowi_flags(sk),
 552                           daddr, inet->inet_saddr, 0, 0);
 553        rcu_read_unlock();
 554}
 555
 556static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 557                                 const struct sk_buff *skb)
 558{
 559        if (skb)
 560                build_skb_flow_key(fl4, skb, sk);
 561        else
 562                build_sk_flow_key(fl4, sk);
 563}
 564
 565static inline void rt_free(struct rtable *rt)
 566{
 567        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 568}
 569
 570static DEFINE_SPINLOCK(fnhe_lock);
 571
 572static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 573{
 574        struct fib_nh_exception *fnhe, *oldest;
 575        struct rtable *orig;
 576
 577        oldest = rcu_dereference(hash->chain);
 578        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 579             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 580                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 581                        oldest = fnhe;
 582        }
 583        orig = rcu_dereference(oldest->fnhe_rth);
 584        if (orig) {
 585                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
 586                rt_free(orig);
 587        }
 588        return oldest;
 589}
 590
 591static inline u32 fnhe_hashfun(__be32 daddr)
 592{
 593        u32 hval;
 594
 595        hval = (__force u32) daddr;
 596        hval ^= (hval >> 11) ^ (hval >> 22);
 597
 598        return hval & (FNHE_HASH_SIZE - 1);
 599}
 600
 601static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 602                                  u32 pmtu, unsigned long expires)
 603{
 604        struct fnhe_hash_bucket *hash;
 605        struct fib_nh_exception *fnhe;
 606        int depth;
 607        u32 hval = fnhe_hashfun(daddr);
 608
 609        spin_lock_bh(&fnhe_lock);
 610
 611        hash = nh->nh_exceptions;
 612        if (!hash) {
 613                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 614                if (!hash)
 615                        goto out_unlock;
 616                nh->nh_exceptions = hash;
 617        }
 618
 619        hash += hval;
 620
 621        depth = 0;
 622        for (fnhe = rcu_dereference(hash->chain); fnhe;
 623             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 624                if (fnhe->fnhe_daddr == daddr)
 625                        break;
 626                depth++;
 627        }
 628
 629        if (fnhe) {
 630                if (gw)
 631                        fnhe->fnhe_gw = gw;
 632                if (pmtu) {
 633                        fnhe->fnhe_pmtu = pmtu;
 634                        fnhe->fnhe_expires = expires;
 635                }
 636        } else {
 637                if (depth > FNHE_RECLAIM_DEPTH)
 638                        fnhe = fnhe_oldest(hash);
 639                else {
 640                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 641                        if (!fnhe)
 642                                goto out_unlock;
 643
 644                        fnhe->fnhe_next = hash->chain;
 645                        rcu_assign_pointer(hash->chain, fnhe);
 646                }
 647                fnhe->fnhe_daddr = daddr;
 648                fnhe->fnhe_gw = gw;
 649                fnhe->fnhe_pmtu = pmtu;
 650                fnhe->fnhe_expires = expires;
 651        }
 652
 653        fnhe->fnhe_stamp = jiffies;
 654
 655out_unlock:
 656        spin_unlock_bh(&fnhe_lock);
 657        return;
 658}
 659
 660static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 661                             bool kill_route)
 662{
 663        __be32 new_gw = icmp_hdr(skb)->un.gateway;
 664        __be32 old_gw = ip_hdr(skb)->saddr;
 665        struct net_device *dev = skb->dev;
 666        struct in_device *in_dev;
 667        struct fib_result res;
 668        struct neighbour *n;
 669        struct net *net;
 670
 671        switch (icmp_hdr(skb)->code & 7) {
 672        case ICMP_REDIR_NET:
 673        case ICMP_REDIR_NETTOS:
 674        case ICMP_REDIR_HOST:
 675        case ICMP_REDIR_HOSTTOS:
 676                break;
 677
 678        default:
 679                return;
 680        }
 681
 682        if (rt->rt_gateway != old_gw)
 683                return;
 684
 685        in_dev = __in_dev_get_rcu(dev);
 686        if (!in_dev)
 687                return;
 688
 689        net = dev_net(dev);
 690        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 691            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 692            ipv4_is_zeronet(new_gw))
 693                goto reject_redirect;
 694
 695        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 696                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 697                        goto reject_redirect;
 698                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 699                        goto reject_redirect;
 700        } else {
 701                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 702                        goto reject_redirect;
 703        }
 704
 705        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 706        if (n) {
 707                if (!(n->nud_state & NUD_VALID)) {
 708                        neigh_event_send(n, NULL);
 709                } else {
 710                        if (fib_lookup(net, fl4, &res) == 0) {
 711                                struct fib_nh *nh = &FIB_RES_NH(res);
 712
 713                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
 714                                                      0, 0);
 715                        }
 716                        if (kill_route)
 717                                rt->dst.obsolete = DST_OBSOLETE_KILL;
 718                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 719                }
 720                neigh_release(n);
 721        }
 722        return;
 723
 724reject_redirect:
 725#ifdef CONFIG_IP_ROUTE_VERBOSE
 726        if (IN_DEV_LOG_MARTIANS(in_dev)) {
 727                const struct iphdr *iph = (const struct iphdr *) skb->data;
 728                __be32 daddr = iph->daddr;
 729                __be32 saddr = iph->saddr;
 730
 731                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 732                                     "  Advised path = %pI4 -> %pI4\n",
 733                                     &old_gw, dev->name, &new_gw,
 734                                     &saddr, &daddr);
 735        }
 736#endif
 737        ;
 738}
 739
 740static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 741{
 742        struct rtable *rt;
 743        struct flowi4 fl4;
 744
 745        rt = (struct rtable *) dst;
 746
 747        ip_rt_build_flow_key(&fl4, sk, skb);
 748        __ip_do_redirect(rt, skb, &fl4, true);
 749}
 750
 751static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 752{
 753        struct rtable *rt = (struct rtable *)dst;
 754        struct dst_entry *ret = dst;
 755
 756        if (rt) {
 757                if (dst->obsolete > 0) {
 758                        ip_rt_put(rt);
 759                        ret = NULL;
 760                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 761                           rt->dst.expires) {
 762                        ip_rt_put(rt);
 763                        ret = NULL;
 764                }
 765        }
 766        return ret;
 767}
 768
 769/*
 770 * Algorithm:
 771 *      1. The first ip_rt_redirect_number redirects are sent
 772 *         with exponential backoff, then we stop sending them at all,
 773 *         assuming that the host ignores our redirects.
 774 *      2. If we did not see packets requiring redirects
 775 *         during ip_rt_redirect_silence, we assume that the host
 776 *         forgot redirected route and start to send redirects again.
 777 *
 778 * This algorithm is much cheaper and more intelligent than dumb load limiting
 779 * in icmp.c.
 780 *
 781 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 782 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 783 */
 784
 785void ip_rt_send_redirect(struct sk_buff *skb)
 786{
 787        struct rtable *rt = skb_rtable(skb);
 788        struct in_device *in_dev;
 789        struct inet_peer *peer;
 790        struct net *net;
 791        int log_martians;
 792
 793        rcu_read_lock();
 794        in_dev = __in_dev_get_rcu(rt->dst.dev);
 795        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 796                rcu_read_unlock();
 797                return;
 798        }
 799        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 800        rcu_read_unlock();
 801
 802        net = dev_net(rt->dst.dev);
 803        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 804        if (!peer) {
 805                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 806                          rt_nexthop(rt, ip_hdr(skb)->daddr));
 807                return;
 808        }
 809
 810        /* No redirected packets during ip_rt_redirect_silence;
 811         * reset the algorithm.
 812         */
 813        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 814                peer->rate_tokens = 0;
 815
 816        /* Too many ignored redirects; do not send anything
 817         * set dst.rate_last to the last seen redirected packet.
 818         */
 819        if (peer->rate_tokens >= ip_rt_redirect_number) {
 820                peer->rate_last = jiffies;
 821                goto out_put_peer;
 822        }
 823
 824        /* Check for load limit; set rate_last to the latest sent
 825         * redirect.
 826         */
 827        if (peer->rate_tokens == 0 ||
 828            time_after(jiffies,
 829                       (peer->rate_last +
 830                        (ip_rt_redirect_load << peer->rate_tokens)))) {
 831                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 832
 833                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 834                peer->rate_last = jiffies;
 835                ++peer->rate_tokens;
 836#ifdef CONFIG_IP_ROUTE_VERBOSE
 837                if (log_martians &&
 838                    peer->rate_tokens == ip_rt_redirect_number)
 839                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 840                                             &ip_hdr(skb)->saddr, inet_iif(skb),
 841                                             &ip_hdr(skb)->daddr, &gw);
 842#endif
 843        }
 844out_put_peer:
 845        inet_putpeer(peer);
 846}
 847
 848static int ip_error(struct sk_buff *skb)
 849{
 850        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 851        struct rtable *rt = skb_rtable(skb);
 852        struct inet_peer *peer;
 853        unsigned long now;
 854        struct net *net;
 855        bool send;
 856        int code;
 857
 858        net = dev_net(rt->dst.dev);
 859        if (!IN_DEV_FORWARD(in_dev)) {
 860                switch (rt->dst.error) {
 861                case EHOSTUNREACH:
 862                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 863                        break;
 864
 865                case ENETUNREACH:
 866                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 867                        break;
 868                }
 869                goto out;
 870        }
 871
 872        switch (rt->dst.error) {
 873        case EINVAL:
 874        default:
 875                goto out;
 876        case EHOSTUNREACH:
 877                code = ICMP_HOST_UNREACH;
 878                break;
 879        case ENETUNREACH:
 880                code = ICMP_NET_UNREACH;
 881                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 882                break;
 883        case EACCES:
 884                code = ICMP_PKT_FILTERED;
 885                break;
 886        }
 887
 888        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 889
 890        send = true;
 891        if (peer) {
 892                now = jiffies;
 893                peer->rate_tokens += now - peer->rate_last;
 894                if (peer->rate_tokens > ip_rt_error_burst)
 895                        peer->rate_tokens = ip_rt_error_burst;
 896                peer->rate_last = now;
 897                if (peer->rate_tokens >= ip_rt_error_cost)
 898                        peer->rate_tokens -= ip_rt_error_cost;
 899                else
 900                        send = false;
 901                inet_putpeer(peer);
 902        }
 903        if (send)
 904                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 905
 906out:    kfree_skb(skb);
 907        return 0;
 908}
 909
 910static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 911{
 912        struct dst_entry *dst = &rt->dst;
 913        struct fib_result res;
 914
 915        if (dst_metric_locked(dst, RTAX_MTU))
 916                return;
 917
 918        if (dst->dev->mtu < mtu)
 919                return;
 920
 921        if (mtu < ip_rt_min_pmtu)
 922                mtu = ip_rt_min_pmtu;
 923
 924        if (!rt->rt_pmtu) {
 925                dst->obsolete = DST_OBSOLETE_KILL;
 926        } else {
 927                rt->rt_pmtu = mtu;
 928                dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
 929        }
 930
 931        rcu_read_lock();
 932        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
 933                struct fib_nh *nh = &FIB_RES_NH(res);
 934
 935                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 936                                      jiffies + ip_rt_mtu_expires);
 937        }
 938        rcu_read_unlock();
 939}
 940
 941static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 942                              struct sk_buff *skb, u32 mtu)
 943{
 944        struct rtable *rt = (struct rtable *) dst;
 945        struct flowi4 fl4;
 946
 947        ip_rt_build_flow_key(&fl4, sk, skb);
 948        __ip_rt_update_pmtu(rt, &fl4, mtu);
 949}
 950
 951void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 952                      int oif, u32 mark, u8 protocol, int flow_flags)
 953{
 954        const struct iphdr *iph = (const struct iphdr *) skb->data;
 955        struct flowi4 fl4;
 956        struct rtable *rt;
 957
 958        __build_flow_key(&fl4, NULL, iph, oif,
 959                         RT_TOS(iph->tos), protocol, mark, flow_flags);
 960        rt = __ip_route_output_key(net, &fl4);
 961        if (!IS_ERR(rt)) {
 962                __ip_rt_update_pmtu(rt, &fl4, mtu);
 963                ip_rt_put(rt);
 964        }
 965}
 966EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 967
 968static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 969{
 970        const struct iphdr *iph = (const struct iphdr *) skb->data;
 971        struct flowi4 fl4;
 972        struct rtable *rt;
 973
 974        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 975        rt = __ip_route_output_key(sock_net(sk), &fl4);
 976        if (!IS_ERR(rt)) {
 977                __ip_rt_update_pmtu(rt, &fl4, mtu);
 978                ip_rt_put(rt);
 979        }
 980}
 981
 982void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 983{
 984        const struct iphdr *iph = (const struct iphdr *) skb->data;
 985        struct flowi4 fl4;
 986        struct rtable *rt;
 987        struct dst_entry *dst;
 988        bool new = false;
 989
 990        bh_lock_sock(sk);
 991        rt = (struct rtable *) __sk_dst_get(sk);
 992
 993        if (sock_owned_by_user(sk) || !rt) {
 994                __ipv4_sk_update_pmtu(skb, sk, mtu);
 995                goto out;
 996        }
 997
 998        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 999
1000        if (!__sk_dst_check(sk, 0)) {
1001                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1002                if (IS_ERR(rt))
1003                        goto out;
1004
1005                new = true;
1006        }
1007
1008        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1009
1010        dst = dst_check(&rt->dst, 0);
1011        if (!dst) {
1012                if (new)
1013                        dst_release(&rt->dst);
1014
1015                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1016                if (IS_ERR(rt))
1017                        goto out;
1018
1019                new = true;
1020        }
1021
1022        if (new)
1023                __sk_dst_set(sk, &rt->dst);
1024
1025out:
1026        bh_unlock_sock(sk);
1027}
1028EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1029
1030void ipv4_redirect(struct sk_buff *skb, struct net *net,
1031                   int oif, u32 mark, u8 protocol, int flow_flags)
1032{
1033        const struct iphdr *iph = (const struct iphdr *) skb->data;
1034        struct flowi4 fl4;
1035        struct rtable *rt;
1036
1037        __build_flow_key(&fl4, NULL, iph, oif,
1038                         RT_TOS(iph->tos), protocol, mark, flow_flags);
1039        rt = __ip_route_output_key(net, &fl4);
1040        if (!IS_ERR(rt)) {
1041                __ip_do_redirect(rt, skb, &fl4, false);
1042                ip_rt_put(rt);
1043        }
1044}
1045EXPORT_SYMBOL_GPL(ipv4_redirect);
1046
1047void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1048{
1049        const struct iphdr *iph = (const struct iphdr *) skb->data;
1050        struct flowi4 fl4;
1051        struct rtable *rt;
1052
1053        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1054        rt = __ip_route_output_key(sock_net(sk), &fl4);
1055        if (!IS_ERR(rt)) {
1056                __ip_do_redirect(rt, skb, &fl4, false);
1057                ip_rt_put(rt);
1058        }
1059}
1060EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1061
1062static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1063{
1064        struct rtable *rt = (struct rtable *) dst;
1065
1066        /* All IPV4 dsts are created with ->obsolete set to the value
1067         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1068         * into this function always.
1069         *
1070         * When a PMTU/redirect information update invalidates a
1071         * route, this is indicated by setting obsolete to
1072         * DST_OBSOLETE_KILL.
1073         */
1074        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1075                return NULL;
1076        return dst;
1077}
1078
1079static void ipv4_link_failure(struct sk_buff *skb)
1080{
1081        struct rtable *rt;
1082
1083        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1084
1085        rt = skb_rtable(skb);
1086        if (rt)
1087                dst_set_expires(&rt->dst, 0);
1088}
1089
1090static int ip_rt_bug(struct sk_buff *skb)
1091{
1092        pr_debug("%s: %pI4 -> %pI4, %s\n",
1093                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1094                 skb->dev ? skb->dev->name : "?");
1095        kfree_skb(skb);
1096        WARN_ON(1);
1097        return 0;
1098}
1099
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1108
1109void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1110{
1111        __be32 src;
1112
1113        if (rt_is_output_route(rt))
1114                src = ip_hdr(skb)->saddr;
1115        else {
1116                struct fib_result res;
1117                struct flowi4 fl4;
1118                struct iphdr *iph;
1119
1120                iph = ip_hdr(skb);
1121
1122                memset(&fl4, 0, sizeof(fl4));
1123                fl4.daddr = iph->daddr;
1124                fl4.saddr = iph->saddr;
1125                fl4.flowi4_tos = RT_TOS(iph->tos);
1126                fl4.flowi4_oif = rt->dst.dev->ifindex;
1127                fl4.flowi4_iif = skb->dev->ifindex;
1128                fl4.flowi4_mark = skb->mark;
1129
1130                rcu_read_lock();
1131                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1132                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1133                else
1134                        src = inet_select_addr(rt->dst.dev,
1135                                               rt_nexthop(rt, iph->daddr),
1136                                               RT_SCOPE_UNIVERSE);
1137                rcu_read_unlock();
1138        }
1139        memcpy(addr, &src, 4);
1140}
1141
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits are
 * treated independently, and a half is only filled in while it is still
 * zero, so values set earlier win.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if ((rt->dst.tclassid & 0xFFFF) == 0)
		rt->dst.tclassid |= tag & 0xFFFF;

	if ((rt->dst.tclassid & 0xFFFF0000) == 0)
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1151
1152static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1153{
1154        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1155
1156        if (advmss == 0) {
1157                advmss = max_t(unsigned int, dst->dev->mtu - 40,
1158                               ip_rt_min_advmss);
1159                if (advmss > 65535 - 40)
1160                        advmss = 65535 - 40;
1161        }
1162        return advmss;
1163}
1164
1165static unsigned int ipv4_mtu(const struct dst_entry *dst)
1166{
1167        const struct rtable *rt = (const struct rtable *) dst;
1168        unsigned int mtu = rt->rt_pmtu;
1169
1170        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1171                mtu = dst_metric_raw(dst, RTAX_MTU);
1172
1173        if (mtu)
1174                return mtu;
1175
1176        mtu = dst->dev->mtu;
1177
1178        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1179                if (rt->rt_uses_gateway && mtu > 576)
1180                        mtu = 576;
1181        }
1182
1183        if (mtu > IP_MAX_MTU)
1184                mtu = IP_MAX_MTU;
1185
1186        return mtu;
1187}
1188
1189static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1190{
1191        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1192        struct fib_nh_exception *fnhe;
1193        u32 hval;
1194
1195        if (!hash)
1196                return NULL;
1197
1198        hval = fnhe_hashfun(daddr);
1199
1200        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1201             fnhe = rcu_dereference(fnhe->fnhe_next)) {
1202                if (fnhe->fnhe_daddr == daddr)
1203                        return fnhe;
1204        }
1205        return NULL;
1206}
1207
1208static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1209                              __be32 daddr)
1210{
1211        bool ret = false;
1212
1213        spin_lock_bh(&fnhe_lock);
1214
1215        if (daddr == fnhe->fnhe_daddr) {
1216                struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1217                if (orig && rt_is_expired(orig)) {
1218                        fnhe->fnhe_gw = 0;
1219                        fnhe->fnhe_pmtu = 0;
1220                        fnhe->fnhe_expires = 0;
1221                }
1222                if (fnhe->fnhe_pmtu) {
1223                        unsigned long expires = fnhe->fnhe_expires;
1224                        unsigned long diff = expires - jiffies;
1225
1226                        if (time_before(jiffies, expires)) {
1227                                rt->rt_pmtu = fnhe->fnhe_pmtu;
1228                                dst_set_expires(&rt->dst, diff);
1229                        }
1230                }
1231                if (fnhe->fnhe_gw) {
1232                        rt->rt_flags |= RTCF_REDIRECTED;
1233                        rt->rt_gateway = fnhe->fnhe_gw;
1234                        rt->rt_uses_gateway = 1;
1235                } else if (!rt->rt_gateway)
1236                        rt->rt_gateway = daddr;
1237
1238                rcu_assign_pointer(fnhe->fnhe_rth, rt);
1239                if (orig)
1240                        rt_free(orig);
1241
1242                fnhe->fnhe_stamp = jiffies;
1243                ret = true;
1244        }
1245        spin_unlock_bh(&fnhe_lock);
1246
1247        return ret;
1248}
1249
1250static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1251{
1252        struct rtable *orig, *prev, **p;
1253        bool ret = true;
1254
1255        if (rt_is_input_route(rt)) {
1256                p = (struct rtable **)&nh->nh_rth_input;
1257        } else {
1258                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1259        }
1260        orig = *p;
1261
1262        prev = cmpxchg(p, orig, rt);
1263        if (prev == orig) {
1264                if (orig)
1265                        rt_free(orig);
1266        } else
1267                ret = false;
1268
1269        return ret;
1270}
1271
1272static DEFINE_SPINLOCK(rt_uncached_lock);
1273static LIST_HEAD(rt_uncached_list);
1274
1275static void rt_add_uncached_list(struct rtable *rt)
1276{
1277        spin_lock_bh(&rt_uncached_lock);
1278        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1279        spin_unlock_bh(&rt_uncached_lock);
1280}
1281
1282static void ipv4_dst_destroy(struct dst_entry *dst)
1283{
1284        struct rtable *rt = (struct rtable *) dst;
1285
1286        if (!list_empty(&rt->rt_uncached)) {
1287                spin_lock_bh(&rt_uncached_lock);
1288                list_del(&rt->rt_uncached);
1289                spin_unlock_bh(&rt_uncached_lock);
1290        }
1291}
1292
1293void rt_flush_dev(struct net_device *dev)
1294{
1295        if (!list_empty(&rt_uncached_list)) {
1296                struct net *net = dev_net(dev);
1297                struct rtable *rt;
1298
1299                spin_lock_bh(&rt_uncached_lock);
1300                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1301                        if (rt->dst.dev != dev)
1302                                continue;
1303                        rt->dst.dev = net->loopback_dev;
1304                        dev_hold(rt->dst.dev);
1305                        dev_put(dev);
1306                }
1307                spin_unlock_bh(&rt_uncached_lock);
1308        }
1309}
1310
1311static bool rt_cache_valid(const struct rtable *rt)
1312{
1313        return  rt &&
1314                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1315                !rt_is_expired(rt);
1316}
1317
1318static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1319                           const struct fib_result *res,
1320                           struct fib_nh_exception *fnhe,
1321                           struct fib_info *fi, u16 type, u32 itag)
1322{
1323        bool cached = false;
1324
1325        if (fi) {
1326                struct fib_nh *nh = &FIB_RES_NH(*res);
1327
1328                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1329                        rt->rt_gateway = nh->nh_gw;
1330                        rt->rt_uses_gateway = 1;
1331                }
1332                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1333#ifdef CONFIG_IP_ROUTE_CLASSID
1334                rt->dst.tclassid = nh->nh_tclassid;
1335#endif
1336                if (unlikely(fnhe))
1337                        cached = rt_bind_exception(rt, fnhe, daddr);
1338                else if (!(rt->dst.flags & DST_NOCACHE))
1339                        cached = rt_cache_route(nh, rt);
1340                if (unlikely(!cached)) {
1341                        /* Routes we intend to cache in nexthop exception or
1342                         * FIB nexthop have the DST_NOCACHE bit clear.
1343                         * However, if we are unsuccessful at storing this
1344                         * route into the cache we really need to set it.
1345                         */
1346                        rt->dst.flags |= DST_NOCACHE;
1347                        if (!rt->rt_gateway)
1348                                rt->rt_gateway = daddr;
1349                        rt_add_uncached_list(rt);
1350                }
1351        } else
1352                rt_add_uncached_list(rt);
1353
1354#ifdef CONFIG_IP_ROUTE_CLASSID
1355#ifdef CONFIG_IP_MULTIPLE_TABLES
1356        set_class_tag(rt, res->tclassid);
1357#endif
1358        set_class_tag(rt, itag);
1359#endif
1360}
1361
1362static struct rtable *rt_dst_alloc(struct net_device *dev,
1363                                   bool nopolicy, bool noxfrm, bool will_cache)
1364{
1365        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1366                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1367                         (nopolicy ? DST_NOPOLICY : 0) |
1368                         (noxfrm ? DST_NOXFRM : 0));
1369}
1370
1371/* called in rcu_read_lock() section */
1372static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1373                                u8 tos, struct net_device *dev, int our)
1374{
1375        struct rtable *rth;
1376        struct in_device *in_dev = __in_dev_get_rcu(dev);
1377        u32 itag = 0;
1378        int err;
1379
1380        /* Primary sanity checks. */
1381
1382        if (in_dev == NULL)
1383                return -EINVAL;
1384
1385        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1386            skb->protocol != htons(ETH_P_IP))
1387                goto e_inval;
1388
1389        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1390                if (ipv4_is_loopback(saddr))
1391                        goto e_inval;
1392
1393        if (ipv4_is_zeronet(saddr)) {
1394                if (!ipv4_is_local_multicast(daddr))
1395                        goto e_inval;
1396        } else {
1397                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1398                                          in_dev, &itag);
1399                if (err < 0)
1400                        goto e_err;
1401        }
1402        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1403                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1404        if (!rth)
1405                goto e_nobufs;
1406
1407#ifdef CONFIG_IP_ROUTE_CLASSID
1408        rth->dst.tclassid = itag;
1409#endif
1410        rth->dst.output = ip_rt_bug;
1411
1412        rth->rt_genid   = rt_genid(dev_net(dev));
1413        rth->rt_flags   = RTCF_MULTICAST;
1414        rth->rt_type    = RTN_MULTICAST;
1415        rth->rt_is_input= 1;
1416        rth->rt_iif     = 0;
1417        rth->rt_pmtu    = 0;
1418        rth->rt_gateway = 0;
1419        rth->rt_uses_gateway = 0;
1420        INIT_LIST_HEAD(&rth->rt_uncached);
1421        if (our) {
1422                rth->dst.input= ip_local_deliver;
1423                rth->rt_flags |= RTCF_LOCAL;
1424        }
1425
1426#ifdef CONFIG_IP_MROUTE
1427        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1428                rth->dst.input = ip_mr_input;
1429#endif
1430        RT_CACHE_STAT_INC(in_slow_mc);
1431
1432        skb_dst_set(skb, &rth->dst);
1433        return 0;
1434
1435e_nobufs:
1436        return -ENOBUFS;
1437e_inval:
1438        return -EINVAL;
1439e_err:
1440        return err;
1441}
1442
1443
1444static void ip_handle_martian_source(struct net_device *dev,
1445                                     struct in_device *in_dev,
1446                                     struct sk_buff *skb,
1447                                     __be32 daddr,
1448                                     __be32 saddr)
1449{
1450        RT_CACHE_STAT_INC(in_martian_src);
1451#ifdef CONFIG_IP_ROUTE_VERBOSE
1452        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1453                /*
1454                 *      RFC1812 recommendation, if source is martian,
1455                 *      the only hint is MAC header.
1456                 */
1457                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1458                        &daddr, &saddr, dev->name);
1459                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1460                        print_hex_dump(KERN_WARNING, "ll header: ",
1461                                       DUMP_PREFIX_OFFSET, 16, 1,
1462                                       skb_mac_header(skb),
1463                                       dev->hard_header_len, true);
1464                }
1465        }
1466#endif
1467}
1468
1469/* called in rcu_read_lock() section */
1470static int __mkroute_input(struct sk_buff *skb,
1471                           const struct fib_result *res,
1472                           struct in_device *in_dev,
1473                           __be32 daddr, __be32 saddr, u32 tos)
1474{
1475        struct rtable *rth;
1476        int err;
1477        struct in_device *out_dev;
1478        unsigned int flags = 0;
1479        bool do_cache;
1480        u32 itag;
1481
1482        /* get a working reference to the output device */
1483        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1484        if (out_dev == NULL) {
1485                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1486                return -EINVAL;
1487        }
1488
1489        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1490                                  in_dev->dev, in_dev, &itag);
1491        if (err < 0) {
1492                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1493                                         saddr);
1494
1495                goto cleanup;
1496        }
1497
1498        do_cache = res->fi && !itag;
1499        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1500            (IN_DEV_SHARED_MEDIA(out_dev) ||
1501             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1502                flags |= RTCF_DOREDIRECT;
1503                do_cache = false;
1504        }
1505
1506        if (skb->protocol != htons(ETH_P_IP)) {
1507                /* Not IP (i.e. ARP). Do not create route, if it is
1508                 * invalid for proxy arp. DNAT routes are always valid.
1509                 *
1510                 * Proxy arp feature have been extended to allow, ARP
1511                 * replies back to the same interface, to support
1512                 * Private VLAN switch technologies. See arp.c.
1513                 */
1514                if (out_dev == in_dev &&
1515                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1516                        err = -EINVAL;
1517                        goto cleanup;
1518                }
1519        }
1520
1521        if (do_cache) {
1522                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1523                if (rt_cache_valid(rth)) {
1524                        skb_dst_set_noref(skb, &rth->dst);
1525                        goto out;
1526                }
1527        }
1528
1529        rth = rt_dst_alloc(out_dev->dev,
1530                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1531                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1532        if (!rth) {
1533                err = -ENOBUFS;
1534                goto cleanup;
1535        }
1536
1537        rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1538        rth->rt_flags = flags;
1539        rth->rt_type = res->type;
1540        rth->rt_is_input = 1;
1541        rth->rt_iif     = 0;
1542        rth->rt_pmtu    = 0;
1543        rth->rt_gateway = 0;
1544        rth->rt_uses_gateway = 0;
1545        INIT_LIST_HEAD(&rth->rt_uncached);
1546
1547        rth->dst.input = ip_forward;
1548        rth->dst.output = ip_output;
1549
1550        rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1551        skb_dst_set(skb, &rth->dst);
1552out:
1553        err = 0;
1554 cleanup:
1555        return err;
1556}
1557
1558static int ip_mkroute_input(struct sk_buff *skb,
1559                            struct fib_result *res,
1560                            const struct flowi4 *fl4,
1561                            struct in_device *in_dev,
1562                            __be32 daddr, __be32 saddr, u32 tos)
1563{
1564#ifdef CONFIG_IP_ROUTE_MULTIPATH
1565        if (res->fi && res->fi->fib_nhs > 1)
1566                fib_select_multipath(res);
1567#endif
1568
1569        /* create a routing cache entry */
1570        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1571}
1572
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	Called with rcu_read_lock() held.
 */
1583
1584static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1585                               u8 tos, struct net_device *dev)
1586{
1587        struct fib_result res;
1588        struct in_device *in_dev = __in_dev_get_rcu(dev);
1589        struct flowi4   fl4;
1590        unsigned int    flags = 0;
1591        u32             itag = 0;
1592        struct rtable   *rth;
1593        int             err = -EINVAL;
1594        struct net    *net = dev_net(dev);
1595        bool do_cache;
1596
1597        /* IP on this device is disabled. */
1598
1599        if (!in_dev)
1600                goto out;
1601
1602        /* Check for the most weird martians, which can be not detected
1603           by fib_lookup.
1604         */
1605
1606        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1607                goto martian_source;
1608
1609        res.fi = NULL;
1610        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1611                goto brd_input;
1612
1613        /* Accept zero addresses only to limited broadcast;
1614         * I even do not know to fix it or not. Waiting for complains :-)
1615         */
1616        if (ipv4_is_zeronet(saddr))
1617                goto martian_source;
1618
1619        if (ipv4_is_zeronet(daddr))
1620                goto martian_destination;
1621
1622        /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1623         * and call it once if daddr or/and saddr are loopback addresses
1624         */
1625        if (ipv4_is_loopback(daddr)) {
1626                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1627                        goto martian_destination;
1628        } else if (ipv4_is_loopback(saddr)) {
1629                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1630                        goto martian_source;
1631        }
1632
1633        /*
1634         *      Now we are ready to route packet.
1635         */
1636        fl4.flowi4_oif = 0;
1637        fl4.flowi4_iif = dev->ifindex;
1638        fl4.flowi4_mark = skb->mark;
1639        fl4.flowi4_tos = tos;
1640        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1641        fl4.daddr = daddr;
1642        fl4.saddr = saddr;
1643        err = fib_lookup(net, &fl4, &res);
1644        if (err != 0)
1645                goto no_route;
1646
1647        RT_CACHE_STAT_INC(in_slow_tot);
1648
1649        if (res.type == RTN_BROADCAST)
1650                goto brd_input;
1651
1652        if (res.type == RTN_LOCAL) {
1653                err = fib_validate_source(skb, saddr, daddr, tos,
1654                                          LOOPBACK_IFINDEX,
1655                                          dev, in_dev, &itag);
1656                if (err < 0)
1657                        goto martian_source_keep_err;
1658                goto local_input;
1659        }
1660
1661        if (!IN_DEV_FORWARD(in_dev))
1662                goto no_route;
1663        if (res.type != RTN_UNICAST)
1664                goto martian_destination;
1665
1666        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1667out:    return err;
1668
1669brd_input:
1670        if (skb->protocol != htons(ETH_P_IP))
1671                goto e_inval;
1672
1673        if (!ipv4_is_zeronet(saddr)) {
1674                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1675                                          in_dev, &itag);
1676                if (err < 0)
1677                        goto martian_source_keep_err;
1678        }
1679        flags |= RTCF_BROADCAST;
1680        res.type = RTN_BROADCAST;
1681        RT_CACHE_STAT_INC(in_brd);
1682
1683local_input:
1684        do_cache = false;
1685        if (res.fi) {
1686                if (!itag) {
1687                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1688                        if (rt_cache_valid(rth)) {
1689                                skb_dst_set_noref(skb, &rth->dst);
1690                                err = 0;
1691                                goto out;
1692                        }
1693                        do_cache = true;
1694                }
1695        }
1696
1697        rth = rt_dst_alloc(net->loopback_dev,
1698                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1699        if (!rth)
1700                goto e_nobufs;
1701
1702        rth->dst.input= ip_local_deliver;
1703        rth->dst.output= ip_rt_bug;
1704#ifdef CONFIG_IP_ROUTE_CLASSID
1705        rth->dst.tclassid = itag;
1706#endif
1707
1708        rth->rt_genid = rt_genid(net);
1709        rth->rt_flags   = flags|RTCF_LOCAL;
1710        rth->rt_type    = res.type;
1711        rth->rt_is_input = 1;
1712        rth->rt_iif     = 0;
1713        rth->rt_pmtu    = 0;
1714        rth->rt_gateway = 0;
1715        rth->rt_uses_gateway = 0;
1716        INIT_LIST_HEAD(&rth->rt_uncached);
1717        if (res.type == RTN_UNREACHABLE) {
1718                rth->dst.input= ip_error;
1719                rth->dst.error= -err;
1720                rth->rt_flags   &= ~RTCF_LOCAL;
1721        }
1722        if (do_cache)
1723                rt_cache_route(&FIB_RES_NH(res), rth);
1724        skb_dst_set(skb, &rth->dst);
1725        err = 0;
1726        goto out;
1727
1728no_route:
1729        RT_CACHE_STAT_INC(in_no_route);
1730        res.type = RTN_UNREACHABLE;
1731        if (err == -ESRCH)
1732                err = -ENETUNREACH;
1733        goto local_input;
1734
1735        /*
1736         *      Do not cache martian addresses: they should be logged (RFC1812)
1737         */
1738martian_destination:
1739        RT_CACHE_STAT_INC(in_martian_dst);
1740#ifdef CONFIG_IP_ROUTE_VERBOSE
1741        if (IN_DEV_LOG_MARTIANS(in_dev))
1742                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1743                                     &daddr, &saddr, dev->name);
1744#endif
1745
1746e_inval:
1747        err = -EINVAL;
1748        goto out;
1749
1750e_nobufs:
1751        err = -ENOBUFS;
1752        goto out;
1753
1754martian_source:
1755        err = -EINVAL;
1756martian_source_keep_err:
1757        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1758        goto out;
1759}
1760
1761int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1762                         u8 tos, struct net_device *dev)
1763{
1764        int res;
1765
1766        rcu_read_lock();
1767
1768        /* Multicast recognition logic is moved from route cache to here.
1769           The problem was that too many Ethernet cards have broken/missing
1770           hardware multicast filters :-( As result the host on multicasting
1771           network acquires a lot of useless route cache entries, sort of
1772           SDR messages from all the world. Now we try to get rid of them.
1773           Really, provided software IP multicast filter is organized
1774           reasonably (at least, hashed), it does not result in a slowdown
1775           comparing with route cache reject entries.
1776           Note, that multicast routers are not affected, because
1777           route cache entry is created eventually.
1778         */
1779        if (ipv4_is_multicast(daddr)) {
1780                struct in_device *in_dev = __in_dev_get_rcu(dev);
1781
1782                if (in_dev) {
1783                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1784                                                  ip_hdr(skb)->protocol);
1785                        if (our
1786#ifdef CONFIG_IP_MROUTE
1787                                ||
1788                            (!ipv4_is_local_multicast(daddr) &&
1789                             IN_DEV_MFORWARD(in_dev))
1790#endif
1791                           ) {
1792                                int res = ip_route_input_mc(skb, daddr, saddr,
1793                                                            tos, dev, our);
1794                                rcu_read_unlock();
1795                                return res;
1796                        }
1797                }
1798                rcu_read_unlock();
1799                return -EINVAL;
1800        }
1801        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1802        rcu_read_unlock();
1803        return res;
1804}
1805EXPORT_SYMBOL(ip_route_input_noref);
1806
1807/* called with rcu_read_lock() */
1808static struct rtable *__mkroute_output(const struct fib_result *res,
1809                                       const struct flowi4 *fl4, int orig_oif,
1810                                       struct net_device *dev_out,
1811                                       unsigned int flags)
1812{
1813        struct fib_info *fi = res->fi;
1814        struct fib_nh_exception *fnhe;
1815        struct in_device *in_dev;
1816        u16 type = res->type;
1817        struct rtable *rth;
1818        bool do_cache;
1819
1820        in_dev = __in_dev_get_rcu(dev_out);
1821        if (!in_dev)
1822                return ERR_PTR(-EINVAL);
1823
1824        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1825                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1826                        return ERR_PTR(-EINVAL);
1827
1828        if (ipv4_is_lbcast(fl4->daddr))
1829                type = RTN_BROADCAST;
1830        else if (ipv4_is_multicast(fl4->daddr))
1831                type = RTN_MULTICAST;
1832        else if (ipv4_is_zeronet(fl4->daddr))
1833                return ERR_PTR(-EINVAL);
1834
1835        if (dev_out->flags & IFF_LOOPBACK)
1836                flags |= RTCF_LOCAL;
1837
1838        do_cache = true;
1839        if (type == RTN_BROADCAST) {
1840                flags |= RTCF_BROADCAST | RTCF_LOCAL;
1841                fi = NULL;
1842        } else if (type == RTN_MULTICAST) {
1843                flags |= RTCF_MULTICAST | RTCF_LOCAL;
1844                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1845                                     fl4->flowi4_proto))
1846                        flags &= ~RTCF_LOCAL;
1847                else
1848                        do_cache = false;
1849                /* If multicast route do not exist use
1850                 * default one, but do not gateway in this case.
1851                 * Yes, it is hack.
1852                 */
1853                if (fi && res->prefixlen < 4)
1854                        fi = NULL;
1855        }
1856
1857        fnhe = NULL;
1858        do_cache &= fi != NULL;
1859        if (do_cache) {
1860                struct rtable __rcu **prth;
1861                struct fib_nh *nh = &FIB_RES_NH(*res);
1862
1863                fnhe = find_exception(nh, fl4->daddr);
1864                if (fnhe)
1865                        prth = &fnhe->fnhe_rth;
1866                else {
1867                        if (unlikely(fl4->flowi4_flags &
1868                                     FLOWI_FLAG_KNOWN_NH &&
1869                                     !(nh->nh_gw &&
1870                                       nh->nh_scope == RT_SCOPE_LINK))) {
1871                                do_cache = false;
1872                                goto add;
1873                        }
1874                        prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1875                }
1876                rth = rcu_dereference(*prth);
1877                if (rt_cache_valid(rth)) {
1878                        dst_hold(&rth->dst);
1879                        return rth;
1880                }
1881        }
1882
1883add:
1884        rth = rt_dst_alloc(dev_out,
1885                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1886                           IN_DEV_CONF_GET(in_dev, NOXFRM),
1887                           do_cache);
1888        if (!rth)
1889                return ERR_PTR(-ENOBUFS);
1890
1891        rth->dst.output = ip_output;
1892
1893        rth->rt_genid = rt_genid(dev_net(dev_out));
1894        rth->rt_flags   = flags;
1895        rth->rt_type    = type;
1896        rth->rt_is_input = 0;
1897        rth->rt_iif     = orig_oif ? : 0;
1898        rth->rt_pmtu    = 0;
1899        rth->rt_gateway = 0;
1900        rth->rt_uses_gateway = 0;
1901        INIT_LIST_HEAD(&rth->rt_uncached);
1902
1903        RT_CACHE_STAT_INC(out_slow_tot);
1904
1905        if (flags & RTCF_LOCAL)
1906                rth->dst.input = ip_local_deliver;
1907        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1908                if (flags & RTCF_LOCAL &&
1909                    !(dev_out->flags & IFF_LOOPBACK)) {
1910                        rth->dst.output = ip_mc_output;
1911                        RT_CACHE_STAT_INC(out_slow_mc);
1912                }
1913#ifdef CONFIG_IP_MROUTE
1914                if (type == RTN_MULTICAST) {
1915                        if (IN_DEV_MFORWARD(in_dev) &&
1916                            !ipv4_is_local_multicast(fl4->daddr)) {
1917                                rth->dst.input = ip_mr_input;
1918                                rth->dst.output = ip_mc_output;
1919                        }
1920                }
1921#endif
1922        }
1923
1924        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1925
1926        return rth;
1927}
1928
1929/*
1930 * Major route resolver routine.
1931 */
1932
1933struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1934{
1935        struct net_device *dev_out = NULL;
1936        __u8 tos = RT_FL_TOS(fl4);
1937        unsigned int flags = 0;
1938        struct fib_result res;
1939        struct rtable *rth;
1940        int orig_oif;
1941
1942        res.tclassid    = 0;
1943        res.fi          = NULL;
1944        res.table       = NULL;
1945
1946        orig_oif = fl4->flowi4_oif;
1947
1948        fl4->flowi4_iif = LOOPBACK_IFINDEX;
1949        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1950        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1951                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1952
1953        rcu_read_lock();
1954        if (fl4->saddr) {
1955                rth = ERR_PTR(-EINVAL);
1956                if (ipv4_is_multicast(fl4->saddr) ||
1957                    ipv4_is_lbcast(fl4->saddr) ||
1958                    ipv4_is_zeronet(fl4->saddr))
1959                        goto out;
1960
1961                /* I removed check for oif == dev_out->oif here.
1962                   It was wrong for two reasons:
1963                   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1964                      is assigned to multiple interfaces.
1965                   2. Moreover, we are allowed to send packets with saddr
1966                      of another iface. --ANK
1967                 */
1968
1969                if (fl4->flowi4_oif == 0 &&
1970                    (ipv4_is_multicast(fl4->daddr) ||
1971                     ipv4_is_lbcast(fl4->daddr))) {
1972                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1973                        dev_out = __ip_dev_find(net, fl4->saddr, false);
1974                        if (dev_out == NULL)
1975                                goto out;
1976
1977                        /* Special hack: user can direct multicasts
1978                           and limited broadcast via necessary interface
1979                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1980                           This hack is not just for fun, it allows
1981                           vic,vat and friends to work.
1982                           They bind socket to loopback, set ttl to zero
1983                           and expect that it will work.
1984                           From the viewpoint of routing cache they are broken,
1985                           because we are not allowed to build multicast path
1986                           with loopback source addr (look, routing cache
1987                           cannot know, that ttl is zero, so that packet
1988                           will not leave this host and route is valid).
1989                           Luckily, this hack is good workaround.
1990                         */
1991
1992                        fl4->flowi4_oif = dev_out->ifindex;
1993                        goto make_route;
1994                }
1995
1996                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1997                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1998                        if (!__ip_dev_find(net, fl4->saddr, false))
1999                                goto out;
2000                }
2001        }
2002
2003
2004        if (fl4->flowi4_oif) {
2005                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2006                rth = ERR_PTR(-ENODEV);
2007                if (dev_out == NULL)
2008                        goto out;
2009
2010                /* RACE: Check return value of inet_select_addr instead. */
2011                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2012                        rth = ERR_PTR(-ENETUNREACH);
2013                        goto out;
2014                }
2015                if (ipv4_is_local_multicast(fl4->daddr) ||
2016                    ipv4_is_lbcast(fl4->daddr)) {
2017                        if (!fl4->saddr)
2018                                fl4->saddr = inet_select_addr(dev_out, 0,
2019                                                              RT_SCOPE_LINK);
2020                        goto make_route;
2021                }
2022                if (fl4->saddr) {
2023                        if (ipv4_is_multicast(fl4->daddr))
2024                                fl4->saddr = inet_select_addr(dev_out, 0,
2025                                                              fl4->flowi4_scope);
2026                        else if (!fl4->daddr)
2027                                fl4->saddr = inet_select_addr(dev_out, 0,
2028                                                              RT_SCOPE_HOST);
2029                }
2030        }
2031
2032        if (!fl4->daddr) {
2033                fl4->daddr = fl4->saddr;
2034                if (!fl4->daddr)
2035                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2036                dev_out = net->loopback_dev;
2037                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2038                res.type = RTN_LOCAL;
2039                flags |= RTCF_LOCAL;
2040                goto make_route;
2041        }
2042
2043        if (fib_lookup(net, fl4, &res)) {
2044                res.fi = NULL;
2045                res.table = NULL;
2046                if (fl4->flowi4_oif) {
2047                        /* Apparently, routing tables are wrong. Assume,
2048                           that the destination is on link.
2049
2050                           WHY? DW.
2051                           Because we are allowed to send to iface
2052                           even if it has NO routes and NO assigned
2053                           addresses. When oif is specified, routing
2054                           tables are looked up with only one purpose:
2055                           to catch if destination is gatewayed, rather than
2056                           direct. Moreover, if MSG_DONTROUTE is set,
2057                           we send packet, ignoring both routing tables
2058                           and ifaddr state. --ANK
2059
2060
2061                           We could make it even if oif is unknown,
2062                           likely IPv6, but we do not.
2063                         */
2064
2065                        if (fl4->saddr == 0)
2066                                fl4->saddr = inet_select_addr(dev_out, 0,
2067                                                              RT_SCOPE_LINK);
2068                        res.type = RTN_UNICAST;
2069                        goto make_route;
2070                }
2071                rth = ERR_PTR(-ENETUNREACH);
2072                goto out;
2073        }
2074
2075        if (res.type == RTN_LOCAL) {
2076                if (!fl4->saddr) {
2077                        if (res.fi->fib_prefsrc)
2078                                fl4->saddr = res.fi->fib_prefsrc;
2079                        else
2080                                fl4->saddr = fl4->daddr;
2081                }
2082                dev_out = net->loopback_dev;
2083                fl4->flowi4_oif = dev_out->ifindex;
2084                flags |= RTCF_LOCAL;
2085                goto make_route;
2086        }
2087
2088#ifdef CONFIG_IP_ROUTE_MULTIPATH
2089        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2090                fib_select_multipath(&res);
2091        else
2092#endif
2093        if (!res.prefixlen &&
2094            res.table->tb_num_default > 1 &&
2095            res.type == RTN_UNICAST && !fl4->flowi4_oif)
2096                fib_select_default(&res);
2097
2098        if (!fl4->saddr)
2099                fl4->saddr = FIB_RES_PREFSRC(net, res);
2100
2101        dev_out = FIB_RES_DEV(res);
2102        fl4->flowi4_oif = dev_out->ifindex;
2103
2104
2105make_route:
2106        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2107
2108out:
2109        rcu_read_unlock();
2110        return rth;
2111}
2112EXPORT_SYMBOL_GPL(__ip_route_output_key);
2113
2114static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2115{
2116        return NULL;
2117}
2118
2119static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2120{
2121        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2122
2123        return mtu ? : dst->dev->mtu;
2124}
2125
2126static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2127                                          struct sk_buff *skb, u32 mtu)
2128{
2129}
2130
/* dst_ops.redirect for blackhole routes: intentionally does nothing. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2135
2136static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2137                                          unsigned long old)
2138{
2139        return NULL;
2140}
2141
2142static struct dst_ops ipv4_dst_blackhole_ops = {
2143        .family                 =       AF_INET,
2144        .protocol               =       cpu_to_be16(ETH_P_IP),
2145        .check                  =       ipv4_blackhole_dst_check,
2146        .mtu                    =       ipv4_blackhole_mtu,
2147        .default_advmss         =       ipv4_default_advmss,
2148        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2149        .redirect               =       ipv4_rt_blackhole_redirect,
2150        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2151        .neigh_lookup           =       ipv4_neigh_lookup,
2152};
2153
2154struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2155{
2156        struct rtable *ort = (struct rtable *) dst_orig;
2157        struct rtable *rt;
2158
2159        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2160        if (rt) {
2161                struct dst_entry *new = &rt->dst;
2162
2163                new->__use = 1;
2164                new->input = dst_discard;
2165                new->output = dst_discard;
2166
2167                new->dev = ort->dst.dev;
2168                if (new->dev)
2169                        dev_hold(new->dev);
2170
2171                rt->rt_is_input = ort->rt_is_input;
2172                rt->rt_iif = ort->rt_iif;
2173                rt->rt_pmtu = ort->rt_pmtu;
2174
2175                rt->rt_genid = rt_genid(net);
2176                rt->rt_flags = ort->rt_flags;
2177                rt->rt_type = ort->rt_type;
2178                rt->rt_gateway = ort->rt_gateway;
2179                rt->rt_uses_gateway = ort->rt_uses_gateway;
2180
2181                INIT_LIST_HEAD(&rt->rt_uncached);
2182
2183                dst_free(new);
2184        }
2185
2186        dst_release(dst_orig);
2187
2188        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2189}
2190
2191struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2192                                    struct sock *sk)
2193{
2194        struct rtable *rt = __ip_route_output_key(net, flp4);
2195
2196        if (IS_ERR(rt))
2197                return rt;
2198
2199        if (flp4->flowi4_proto)
2200                rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2201                                                   flowi4_to_flowi(flp4),
2202                                                   sk, 0);
2203
2204        return rt;
2205}
2206EXPORT_SYMBOL_GPL(ip_route_output_flow);
2207
2208static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2209                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2210                        u32 seq, int event, int nowait, unsigned int flags)
2211{
2212        struct rtable *rt = skb_rtable(skb);
2213        struct rtmsg *r;
2214        struct nlmsghdr *nlh;
2215        unsigned long expires = 0;
2216        u32 error;
2217        u32 metrics[RTAX_MAX];
2218
2219        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2220        if (nlh == NULL)
2221                return -EMSGSIZE;
2222
2223        r = nlmsg_data(nlh);
2224        r->rtm_family    = AF_INET;
2225        r->rtm_dst_len  = 32;
2226        r->rtm_src_len  = 0;
2227        r->rtm_tos      = fl4->flowi4_tos;
2228        r->rtm_table    = RT_TABLE_MAIN;
2229        if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2230                goto nla_put_failure;
2231        r->rtm_type     = rt->rt_type;
2232        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2233        r->rtm_protocol = RTPROT_UNSPEC;
2234        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2235        if (rt->rt_flags & RTCF_NOTIFY)
2236                r->rtm_flags |= RTM_F_NOTIFY;
2237
2238        if (nla_put_be32(skb, RTA_DST, dst))
2239                goto nla_put_failure;
2240        if (src) {
2241                r->rtm_src_len = 32;
2242                if (nla_put_be32(skb, RTA_SRC, src))
2243                        goto nla_put_failure;
2244        }
2245        if (rt->dst.dev &&
2246            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2247                goto nla_put_failure;
2248#ifdef CONFIG_IP_ROUTE_CLASSID
2249        if (rt->dst.tclassid &&
2250            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2251                goto nla_put_failure;
2252#endif
2253        if (!rt_is_input_route(rt) &&
2254            fl4->saddr != src) {
2255                if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2256                        goto nla_put_failure;
2257        }
2258        if (rt->rt_uses_gateway &&
2259            nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2260                goto nla_put_failure;
2261
2262        expires = rt->dst.expires;
2263        if (expires) {
2264                unsigned long now = jiffies;
2265
2266                if (time_before(now, expires))
2267                        expires -= now;
2268                else
2269                        expires = 0;
2270        }
2271
2272        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2273        if (rt->rt_pmtu && expires)
2274                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2275        if (rtnetlink_put_metrics(skb, metrics) < 0)
2276                goto nla_put_failure;
2277
2278        if (fl4->flowi4_mark &&
2279            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2280                goto nla_put_failure;
2281
2282        error = rt->dst.error;
2283
2284        if (rt_is_input_route(rt)) {
2285                if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2286                        goto nla_put_failure;
2287        }
2288
2289        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2290                goto nla_put_failure;
2291
2292        return nlmsg_end(skb, nlh);
2293
2294nla_put_failure:
2295        nlmsg_cancel(skb, nlh);
2296        return -EMSGSIZE;
2297}
2298
2299static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2300{
2301        struct net *net = sock_net(in_skb->sk);
2302        struct rtmsg *rtm;
2303        struct nlattr *tb[RTA_MAX+1];
2304        struct rtable *rt = NULL;
2305        struct flowi4 fl4;
2306        __be32 dst = 0;
2307        __be32 src = 0;
2308        u32 iif;
2309        int err;
2310        int mark;
2311        struct sk_buff *skb;
2312
2313        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2314        if (err < 0)
2315                goto errout;
2316
2317        rtm = nlmsg_data(nlh);
2318
2319        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2320        if (skb == NULL) {
2321                err = -ENOBUFS;
2322                goto errout;
2323        }
2324
2325        /* Reserve room for dummy headers, this skb can pass
2326           through good chunk of routing engine.
2327         */
2328        skb_reset_mac_header(skb);
2329        skb_reset_network_header(skb);
2330
2331        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2332        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2333        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2334
2335        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2336        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2337        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2338        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2339
2340        memset(&fl4, 0, sizeof(fl4));
2341        fl4.daddr = dst;
2342        fl4.saddr = src;
2343        fl4.flowi4_tos = rtm->rtm_tos;
2344        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2345        fl4.flowi4_mark = mark;
2346
2347        if (iif) {
2348                struct net_device *dev;
2349
2350                dev = __dev_get_by_index(net, iif);
2351                if (dev == NULL) {
2352                        err = -ENODEV;
2353                        goto errout_free;
2354                }
2355
2356                skb->protocol   = htons(ETH_P_IP);
2357                skb->dev        = dev;
2358                skb->mark       = mark;
2359                local_bh_disable();
2360                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2361                local_bh_enable();
2362
2363                rt = skb_rtable(skb);
2364                if (err == 0 && rt->dst.error)
2365                        err = -rt->dst.error;
2366        } else {
2367                rt = ip_route_output_key(net, &fl4);
2368
2369                err = 0;
2370                if (IS_ERR(rt))
2371                        err = PTR_ERR(rt);
2372        }
2373
2374        if (err)
2375                goto errout_free;
2376
2377        skb_dst_set(skb, &rt->dst);
2378        if (rtm->rtm_flags & RTM_F_NOTIFY)
2379                rt->rt_flags |= RTCF_NOTIFY;
2380
2381        err = rt_fill_info(net, dst, src, &fl4, skb,
2382                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2383                           RTM_NEWROUTE, 0, 0);
2384        if (err <= 0)
2385                goto errout_free;
2386
2387        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2388errout:
2389        return err;
2390
2391errout_free:
2392        kfree_skb(skb);
2393        goto errout;
2394}
2395
2396int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2397{
2398        return skb->len;
2399}
2400
2401void ip_rt_multicast_event(struct in_device *in_dev)
2402{
2403        rt_cache_flush(dev_net(in_dev->dev));
2404}
2405
2406#ifdef CONFIG_SYSCTL
2407static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2408                                        void __user *buffer,
2409                                        size_t *lenp, loff_t *ppos)
2410{
2411        if (write) {
2412                rt_cache_flush((struct net *)__ctl->extra1);
2413                return 0;
2414        }
2415
2416        return -EINVAL;
2417}
2418
2419static ctl_table ipv4_route_table[] = {
2420        {
2421                .procname       = "gc_thresh",
2422                .data           = &ipv4_dst_ops.gc_thresh,
2423                .maxlen         = sizeof(int),
2424                .mode           = 0644,
2425                .proc_handler   = proc_dointvec,
2426        },
2427        {
2428                .procname       = "max_size",
2429                .data           = &ip_rt_max_size,
2430                .maxlen         = sizeof(int),
2431                .mode           = 0644,
2432                .proc_handler   = proc_dointvec,
2433        },
2434        {
2435                /*  Deprecated. Use gc_min_interval_ms */
2436
2437                .procname       = "gc_min_interval",
2438                .data           = &ip_rt_gc_min_interval,
2439                .maxlen         = sizeof(int),
2440                .mode           = 0644,
2441                .proc_handler   = proc_dointvec_jiffies,
2442        },
2443        {
2444                .procname       = "gc_min_interval_ms",
2445                .data           = &ip_rt_gc_min_interval,
2446                .maxlen         = sizeof(int),
2447                .mode           = 0644,
2448                .proc_handler   = proc_dointvec_ms_jiffies,
2449        },
2450        {
2451                .procname       = "gc_timeout",
2452                .data           = &ip_rt_gc_timeout,
2453                .maxlen         = sizeof(int),
2454                .mode           = 0644,
2455                .proc_handler   = proc_dointvec_jiffies,
2456        },
2457        {
2458                .procname       = "gc_interval",
2459                .data           = &ip_rt_gc_interval,
2460                .maxlen         = sizeof(int),
2461                .mode           = 0644,
2462                .proc_handler   = proc_dointvec_jiffies,
2463        },
2464        {
2465                .procname       = "redirect_load",
2466                .data           = &ip_rt_redirect_load,
2467                .maxlen         = sizeof(int),
2468                .mode           = 0644,
2469                .proc_handler   = proc_dointvec,
2470        },
2471        {
2472                .procname       = "redirect_number",
2473                .data           = &ip_rt_redirect_number,
2474                .maxlen         = sizeof(int),
2475                .mode           = 0644,
2476                .proc_handler   = proc_dointvec,
2477        },
2478        {
2479                .procname       = "redirect_silence",
2480                .data           = &ip_rt_redirect_silence,
2481                .maxlen         = sizeof(int),
2482                .mode           = 0644,
2483                .proc_handler   = proc_dointvec,
2484        },
2485        {
2486                .procname       = "error_cost",
2487                .data           = &ip_rt_error_cost,
2488                .maxlen         = sizeof(int),
2489                .mode           = 0644,
2490                .proc_handler   = proc_dointvec,
2491        },
2492        {
2493                .procname       = "error_burst",
2494                .data           = &ip_rt_error_burst,
2495                .maxlen         = sizeof(int),
2496                .mode           = 0644,
2497                .proc_handler   = proc_dointvec,
2498        },
2499        {
2500                .procname       = "gc_elasticity",
2501                .data           = &ip_rt_gc_elasticity,
2502                .maxlen         = sizeof(int),
2503                .mode           = 0644,
2504                .proc_handler   = proc_dointvec,
2505        },
2506        {
2507                .procname       = "mtu_expires",
2508                .data           = &ip_rt_mtu_expires,
2509                .maxlen         = sizeof(int),
2510                .mode           = 0644,
2511                .proc_handler   = proc_dointvec_jiffies,
2512        },
2513        {
2514                .procname       = "min_pmtu",
2515                .data           = &ip_rt_min_pmtu,
2516                .maxlen         = sizeof(int),
2517                .mode           = 0644,
2518                .proc_handler   = proc_dointvec,
2519        },
2520        {
2521                .procname       = "min_adv_mss",
2522                .data           = &ip_rt_min_advmss,
2523                .maxlen         = sizeof(int),
2524                .mode           = 0644,
2525                .proc_handler   = proc_dointvec,
2526        },
2527        { }
2528};
2529
2530static struct ctl_table ipv4_route_flush_table[] = {
2531        {
2532                .procname       = "flush",
2533                .maxlen         = sizeof(int),
2534                .mode           = 0200,
2535                .proc_handler   = ipv4_sysctl_rtcache_flush,
2536        },
2537        { },
2538};
2539
2540static __net_init int sysctl_route_net_init(struct net *net)
2541{
2542        struct ctl_table *tbl;
2543
2544        tbl = ipv4_route_flush_table;
2545        if (!net_eq(net, &init_net)) {
2546                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2547                if (tbl == NULL)
2548                        goto err_dup;
2549        }
2550        tbl[0].extra1 = net;
2551
2552        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2553        if (net->ipv4.route_hdr == NULL)
2554                goto err_reg;
2555        return 0;
2556
2557err_reg:
2558        if (tbl != ipv4_route_flush_table)
2559                kfree(tbl);
2560err_dup:
2561        return -ENOMEM;
2562}
2563
2564static __net_exit void sysctl_route_net_exit(struct net *net)
2565{
2566        struct ctl_table *tbl;
2567
2568        tbl = net->ipv4.route_hdr->ctl_table_arg;
2569        unregister_net_sysctl_table(net->ipv4.route_hdr);
2570        BUG_ON(tbl == ipv4_route_flush_table);
2571        kfree(tbl);
2572}
2573
2574static __net_initdata struct pernet_operations sysctl_route_ops = {
2575        .init = sysctl_route_net_init,
2576        .exit = sysctl_route_net_exit,
2577};
2578#endif
2579
2580static __net_init int rt_genid_init(struct net *net)
2581{
2582        atomic_set(&net->rt_genid, 0);
2583        get_random_bytes(&net->ipv4.dev_addr_genid,
2584                         sizeof(net->ipv4.dev_addr_genid));
2585        return 0;
2586}
2587
2588static __net_initdata struct pernet_operations rt_genid_ops = {
2589        .init = rt_genid_init,
2590};
2591
2592static int __net_init ipv4_inetpeer_init(struct net *net)
2593{
2594        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2595
2596        if (!bp)
2597                return -ENOMEM;
2598        inet_peer_base_init(bp);
2599        net->ipv4.peers = bp;
2600        return 0;
2601}
2602
2603static void __net_exit ipv4_inetpeer_exit(struct net *net)
2604{
2605        struct inet_peer_base *bp = net->ipv4.peers;
2606
2607        net->ipv4.peers = NULL;
2608        inetpeer_invalidate_tree(bp);
2609        kfree(bp);
2610}
2611
2612static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2613        .init   =       ipv4_inetpeer_init,
2614        .exit   =       ipv4_inetpeer_exit,
2615};
2616
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; ip_rt_init() allocates 256 entries for it. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2620
2621int __init ip_rt_init(void)
2622{
2623        int rc = 0;
2624
2625#ifdef CONFIG_IP_ROUTE_CLASSID
2626        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2627        if (!ip_rt_acct)
2628                panic("IP: failed to allocate ip_rt_acct\n");
2629#endif
2630
2631        ipv4_dst_ops.kmem_cachep =
2632                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2633                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2634
2635        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2636
2637        if (dst_entries_init(&ipv4_dst_ops) < 0)
2638                panic("IP: failed to allocate ipv4_dst_ops counter\n");
2639
2640        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2641                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2642
2643        ipv4_dst_ops.gc_thresh = ~0;
2644        ip_rt_max_size = INT_MAX;
2645
2646        devinet_init();
2647        ip_fib_init();
2648
2649        if (ip_rt_proc_init())
2650                pr_err("Unable to create route proc files\n");
2651#ifdef CONFIG_XFRM
2652        xfrm_init();
2653        xfrm4_init();
2654#endif
2655        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2656
2657#ifdef CONFIG_SYSCTL
2658        register_pernet_subsys(&sysctl_route_ops);
2659#endif
2660        register_pernet_subsys(&rt_genid_ops);
2661        register_pernet_subsys(&ipv4_inetpeer_ops);
2662        return rc;
2663}
2664
#ifdef CONFIG_SYSCTL
/*
 * Register ipv4_route_table under net/ipv4/route for init_net.
 *
 * This lives in a separate entry point only because of the current
 * ipv4 init ordering; once that ordering is cleaned up, this function
 * can go away.  The registration result is not checked.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif /* CONFIG_SYSCTL */
2675
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.