linux/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       Verify area fixes.
  16 *              Alan Cox        :       cli() protects routing changes
  17 *              Rui Oliveira    :       ICMP routing table updates
  18 *              (rco@di.uminho.pt)      Routing table insertion and update
  19 *              Linus Torvalds  :       Rewrote bits to be sensible
  20 *              Alan Cox        :       Added BSD route gw semantics
  21 *              Alan Cox        :       Super /proc >4K
  22 *              Alan Cox        :       MTU in route table
  23 *              Alan Cox        :       MSS actually. Also added the window
  24 *                                      clamper.
  25 *              Sam Lantinga    :       Fixed route matching in rt_del()
  26 *              Alan Cox        :       Routing cache support.
  27 *              Alan Cox        :       Removed compatibility cruft.
  28 *              Alan Cox        :       RTF_REJECT support.
  29 *              Alan Cox        :       TCP irtt support.
  30 *              Jonathan Naylor :       Added Metric support.
  31 *      Miquel van Smoorenburg  :       BSD API fixes.
  32 *      Miquel van Smoorenburg  :       Metrics.
  33 *              Alan Cox        :       Use __u32 properly
  34 *              Alan Cox        :       Aligned routing errors more closely with BSD
  35 *                                      our system is still very different.
  36 *              Alan Cox        :       Faster /proc handling
  37 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38 *                                      routing caches and better behaviour.
  39 *
  40 *              Olaf Erb        :       irtt wasn't being copied right.
  41 *              Bjorn Ekwall    :       Kerneld route support.
  42 *              Alan Cox        :       Multicast fixed (I hope)
  43 *              Pavel Krauz     :       Limited broadcast fixed
  44 *              Mike McLagan    :       Routing by source
  45 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46 *                                      route.c and rewritten from scratch.
  47 *              Andi Kleen      :       Load-limit warning messages.
  48 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52 *              Marc Boucher    :       routing by fwmark
  53 *      Robert Olsson           :       Added rt_cache statistics
  54 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58 *
  59 *              This program is free software; you can redistribute it and/or
  60 *              modify it under the terms of the GNU General Public License
  61 *              as published by the Free Software Foundation; either version
  62 *              2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/skbuff.h>
  83#include <linux/inetdevice.h>
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
  92#include <net/dst.h>
  93#include <net/net_namespace.h>
  94#include <net/protocol.h>
  95#include <net/ip.h>
  96#include <net/route.h>
  97#include <net/inetpeer.h>
  98#include <net/sock.h>
  99#include <net/ip_fib.h>
 100#include <net/arp.h>
 101#include <net/tcp.h>
 102#include <net/icmp.h>
 103#include <net/xfrm.h>
 104#include <net/netevent.h>
 105#include <net/rtnetlink.h>
 106#ifdef CONFIG_SYSCTL
 107#include <linux/sysctl.h>
 108#include <linux/kmemleak.h>
 109#endif
 110#include <net/secure_seq.h>
 111
/* Extract the routing-relevant TOS bits (plus the RTO_ONLINK flag)
 * from a flowi4.
 */
#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Hard upper bound for a route MTU, kept just below 64K. */
#define IP_MAX_MTU      0xFFF0

/* Default expiry for garbage collection of cached routes. */
#define RT_GC_TIMEOUT (300*HZ)

/* Routing tunables; presumably exported via the sysctl table (not in
 * this chunk) — hence __read_mostly, as they are rarely written.
 */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;
 132
/*
 *      Interface to generic destination cache.
 */

/* Forward declarations of the dst_ops callbacks wired up below. */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);
 147
/* dst_ops->ifdown hook: intentionally empty — nothing extra to do
 * here when a device goes away.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}
 152
/* dst_ops->cow_metrics hook: copy-on-write of metrics is never
 * expected on this path, so reaching it is a bug — warn and fail.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}
 158
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

/* Generic destination-cache operations for IPv4 routes. */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};
 179
/* The ECN variant of each TOS class maps to the same priority. */
#define ECN_OR_COST(class)      TC_PRIO_##class

/* IP TOS to queueing-priority (TC_PRIO_*) lookup table; exported for
 * use by other parts of the stack.
 */
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

/* Per-CPU routing statistics, dumped by the rt_cpu seq_file below. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 204
 205#ifdef CONFIG_PROC_FS
 206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 207{
 208        if (*pos)
 209                return NULL;
 210        return SEQ_START_TOKEN;
 211}
 212
 213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 214{
 215        ++*pos;
 216        return NULL;
 217}
 218
/* seq_file stop: no state is held across the iteration. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

/* Print only the legacy /proc/net/rt_cache header line; no cache
 * entries are ever listed (start/next above yield none).
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}
 232
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

/* File operations behind /proc/net/rt_cache (header-only listing). */
static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
 252
 253
/* seq_file start for the per-CPU statistics: position 0 yields the
 * header token; positions >= 1 map to CPUs, with *pos - 1 being the
 * first candidate CPU index.  Skips impossible CPU ids.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                /* Remember where to resume: one past this CPU. */
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}
 269
/* Advance to the next possible CPU's statistics (NULL when done). */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}
 283
/* seq_file stop: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

/* One row of counters per possible CPU, preceded by a header line.
 * Note the first column repeats the global dst-entry count on every
 * row rather than being a per-CPU value.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}
 322
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

/* File operations behind the per-CPU statistics file (created as
 * "rt_cache" under the per-netns proc_net_stat directory).
 */
static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
 343
 344#ifdef CONFIG_IP_ROUTE_CLASSID
/* Sum the per-CPU ip_rt_acct counters across all possible CPUs and
 * dump the 256-slot aggregate as raw binary via seq_write().
 * Returns -ENOMEM if the temporary accumulation buffer cannot be
 * allocated.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
 368
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

/* File operations behind /proc/net/rt_acct (CONFIG_IP_ROUTE_CLASSID). */
static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif
 382
/* Create the per-namespace proc entries: /proc/net/rt_cache, the
 * per-CPU stats file (proc_net_stat/rt_cache) and, with
 * CONFIG_IP_ROUTE_CLASSID, /proc/net/rt_acct.  On any failure the
 * already-created entries are unwound in reverse order and -ENOMEM is
 * returned.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
                        &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}
 413
/* Remove the proc entries created by ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}
 422
/* Per-network-namespace hook-up of the proc entries above. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* No CONFIG_PROC_FS: proc setup is a successful no-op. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */
 439
/* A cached route is stale once its generation id no longer matches the
 * namespace-wide generation counter.
 */
static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/* Invalidate every cached route in @net by bumping the generation id. */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}
 449
/* Resolve the ARP neighbour for a route.  Key preference: the route's
 * gateway if set, else the skb's destination address, else the @daddr
 * the caller supplied.  Falls back to creating a new neighbour entry
 * when none is cached.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}
 470
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */

/* Fallback IP-ID generator used when no inet_peer is available: chains
 * a keyed hash of (previous id ^ daddr) under a spinlock.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

/* Assign the IP header identifier from the per-destination inet_peer
 * counter when one can be obtained, otherwise via the fallback above.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
 506
/* Fill @fl4 for a route lookup.  When a socket is supplied, its bound
 * device, mark, connection TOS and protocol override the packet-derived
 * values passed in.
 */
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}
 525
 526static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 527                               const struct sock *sk)
 528{
 529        const struct iphdr *iph = ip_hdr(skb);
 530        int oif = skb->dev->ifindex;
 531        u8 tos = RT_TOS(iph->tos);
 532        u8 prot = iph->protocol;
 533        u32 mark = skb->mark;
 534
 535        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 536}
 537
/* Derive a flow key purely from socket state (no packet available).
 * With a strict/loose source-route option installed, route towards the
 * first hop (faddr) rather than the final destination.
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}
 555
 556static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 557                                 const struct sk_buff *skb)
 558{
 559        if (skb)
 560                build_skb_flow_key(fl4, skb, sk);
 561        else
 562                build_sk_flow_key(fl4, sk);
 563}
 564
/* Free a route after an RCU grace period has elapsed. */
static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

/* Serializes writers of the per-nexthop exception hash chains. */
static DEFINE_SPINLOCK(fnhe_lock);
 571
/* Select the least-recently stamped exception on @hash for reuse and
 * release its cached route, if any.  Caller holds fnhe_lock and
 * guarantees the chain is non-empty (only called when the chain has
 * exceeded the reclaim depth).
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;
        struct rtable *orig;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        orig = rcu_dereference(oldest->fnhe_rth);
        if (orig) {
                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
                rt_free(orig);
        }
        return oldest;
}
 590
 591static inline u32 fnhe_hashfun(__be32 daddr)
 592{
 593        u32 hval;
 594
 595        hval = (__force u32) daddr;
 596        hval ^= (hval >> 11) ^ (hval >> 22);
 597
 598        return hval & (FNHE_HASH_SIZE - 1);
 599}
 600
 601static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 602                                  u32 pmtu, unsigned long expires)
 603{
 604        struct fnhe_hash_bucket *hash;
 605        struct fib_nh_exception *fnhe;
 606        int depth;
 607        u32 hval = fnhe_hashfun(daddr);
 608
 609        spin_lock_bh(&fnhe_lock);
 610
 611        hash = nh->nh_exceptions;
 612        if (!hash) {
 613                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 614                if (!hash)
 615                        goto out_unlock;
 616                nh->nh_exceptions = hash;
 617        }
 618
 619        hash += hval;
 620
 621        depth = 0;
 622        for (fnhe = rcu_dereference(hash->chain); fnhe;
 623             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 624                if (fnhe->fnhe_daddr == daddr)
 625                        break;
 626                depth++;
 627        }
 628
 629        if (fnhe) {
 630                if (gw)
 631                        fnhe->fnhe_gw = gw;
 632                if (pmtu) {
 633                        fnhe->fnhe_pmtu = pmtu;
 634                        fnhe->fnhe_expires = expires;
 635                }
 636        } else {
 637                if (depth > FNHE_RECLAIM_DEPTH)
 638                        fnhe = fnhe_oldest(hash);
 639                else {
 640                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 641                        if (!fnhe)
 642                                goto out_unlock;
 643
 644                        fnhe->fnhe_next = hash->chain;
 645                        rcu_assign_pointer(hash->chain, fnhe);
 646                }
 647                fnhe->fnhe_daddr = daddr;
 648                fnhe->fnhe_gw = gw;
 649                fnhe->fnhe_pmtu = pmtu;
 650                fnhe->fnhe_expires = expires;
 651        }
 652
 653        fnhe->fnhe_stamp = jiffies;
 654
 655out_unlock:
 656        spin_unlock_bh(&fnhe_lock);
 657        return;
 658}
 659
/* Process an ICMP redirect for route @rt: validate the advertised
 * gateway, make sure a neighbour entry for it exists (or is being
 * resolved), and record the new gateway as a next-hop exception on the
 * route's FIB nexthop.  Disallowed or bogus redirects are ignored and,
 * with CONFIG_IP_ROUTE_VERBOSE, rate-limited-logged.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        /* Only the four defined redirect codes are acted upon. */
        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        /* A redirect is only valid if it comes from our current gateway. */
        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        /* Reject no-op, policy-disabled, multicast, broadcast and
         * zero-network gateways outright.
         */
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                /* Non-shared media: new gateway must be directly on-link,
                 * and (with secure redirects) already a known default gw.
                 */
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        /* Kick off resolution; the redirect is not
                         * applied until the neighbour is valid.
                         */
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}
 739
 740static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 741{
 742        struct rtable *rt;
 743        struct flowi4 fl4;
 744
 745        rt = (struct rtable *) dst;
 746
 747        ip_rt_build_flow_key(&fl4, sk, skb);
 748        __ip_do_redirect(rt, skb, &fl4, true);
 749}
 750
 751static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 752{
 753        struct rtable *rt = (struct rtable *)dst;
 754        struct dst_entry *ret = dst;
 755
 756        if (rt) {
 757                if (dst->obsolete > 0) {
 758                        ip_rt_put(rt);
 759                        ret = NULL;
 760                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 761                           rt->dst.expires) {
 762                        ip_rt_put(rt);
 763                        ret = NULL;
 764                }
 765        }
 766        return ret;
 767}
 768
 769/*
 770 * Algorithm:
 771 *      1. The first ip_rt_redirect_number redirects are sent
 772 *         with exponential backoff, then we stop sending them at all,
 773 *         assuming that the host ignores our redirects.
 774 *      2. If we did not see packets requiring redirects
 775 *         during ip_rt_redirect_silence, we assume that the host
 776 *         forgot redirected route and start to send redirects again.
 777 *
 778 * This algorithm is much cheaper and more intelligent than dumb load limiting
 779 * in icmp.c.
 780 *
 781 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 782 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 783 */
 784
/* Send an ICMP redirect for @skb, rate-limited per source peer (see
 * the algorithm comment above): exponential backoff over the first
 * ip_rt_redirect_number redirects, then silence until the peer has
 * been quiet for ip_rt_redirect_silence.  Without an inet_peer entry
 * (OOM) the redirect is sent unconditionally.
 *
 * NOTE(review): peer->rate_tokens is also used by ip_error()'s
 * token bucket; the two limiters presumably interact — verify.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything
         * set dst.rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}
 847
 848static int ip_error(struct sk_buff *skb)
 849{
 850        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 851        struct rtable *rt = skb_rtable(skb);
 852        struct inet_peer *peer;
 853        unsigned long now;
 854        struct net *net;
 855        bool send;
 856        int code;
 857
 858        net = dev_net(rt->dst.dev);
 859        if (!IN_DEV_FORWARD(in_dev)) {
 860                switch (rt->dst.error) {
 861                case EHOSTUNREACH:
 862                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 863                        break;
 864
 865                case ENETUNREACH:
 866                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 867                        break;
 868                }
 869                goto out;
 870        }
 871
 872        switch (rt->dst.error) {
 873        case EINVAL:
 874        default:
 875                goto out;
 876        case EHOSTUNREACH:
 877                code = ICMP_HOST_UNREACH;
 878                break;
 879        case ENETUNREACH:
 880                code = ICMP_NET_UNREACH;
 881                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 882                break;
 883        case EACCES:
 884                code = ICMP_PKT_FILTERED;
 885                break;
 886        }
 887
 888        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 889
 890        send = true;
 891        if (peer) {
 892                now = jiffies;
 893                peer->rate_tokens += now - peer->rate_last;
 894                if (peer->rate_tokens > ip_rt_error_burst)
 895                        peer->rate_tokens = ip_rt_error_burst;
 896                peer->rate_last = now;
 897                if (peer->rate_tokens >= ip_rt_error_cost)
 898                        peer->rate_tokens -= ip_rt_error_cost;
 899                else
 900                        send = false;
 901                inet_putpeer(peer);
 902        }
 903        if (send)
 904                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 905
 906out:    kfree_skb(skb);
 907        return 0;
 908}
 909
 910static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 911{
 912        struct dst_entry *dst = &rt->dst;
 913        struct fib_result res;
 914
 915        if (dst->dev->mtu < mtu)
 916                return;
 917
 918        if (mtu < ip_rt_min_pmtu)
 919                mtu = ip_rt_min_pmtu;
 920
 921        if (!rt->rt_pmtu) {
 922                dst->obsolete = DST_OBSOLETE_KILL;
 923        } else {
 924                rt->rt_pmtu = mtu;
 925                dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
 926        }
 927
 928        rcu_read_lock();
 929        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
 930                struct fib_nh *nh = &FIB_RES_NH(res);
 931
 932                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 933                                      jiffies + ip_rt_mtu_expires);
 934        }
 935        rcu_read_unlock();
 936}
 937
 938static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 939                              struct sk_buff *skb, u32 mtu)
 940{
 941        struct rtable *rt = (struct rtable *) dst;
 942        struct flowi4 fl4;
 943
 944        ip_rt_build_flow_key(&fl4, sk, skb);
 945        __ip_rt_update_pmtu(rt, &fl4, mtu);
 946}
 947
 948void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 949                      int oif, u32 mark, u8 protocol, int flow_flags)
 950{
 951        const struct iphdr *iph = (const struct iphdr *) skb->data;
 952        struct flowi4 fl4;
 953        struct rtable *rt;
 954
 955        __build_flow_key(&fl4, NULL, iph, oif,
 956                         RT_TOS(iph->tos), protocol, mark, flow_flags);
 957        rt = __ip_route_output_key(net, &fl4);
 958        if (!IS_ERR(rt)) {
 959                __ip_rt_update_pmtu(rt, &fl4, mtu);
 960                ip_rt_put(rt);
 961        }
 962}
 963EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 964
 965void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 966{
 967        const struct iphdr *iph = (const struct iphdr *) skb->data;
 968        struct flowi4 fl4;
 969        struct rtable *rt;
 970
 971        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 972        rt = __ip_route_output_key(sock_net(sk), &fl4);
 973        if (!IS_ERR(rt)) {
 974                __ip_rt_update_pmtu(rt, &fl4, mtu);
 975                ip_rt_put(rt);
 976        }
 977}
 978EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 979
 980void ipv4_redirect(struct sk_buff *skb, struct net *net,
 981                   int oif, u32 mark, u8 protocol, int flow_flags)
 982{
 983        const struct iphdr *iph = (const struct iphdr *) skb->data;
 984        struct flowi4 fl4;
 985        struct rtable *rt;
 986
 987        __build_flow_key(&fl4, NULL, iph, oif,
 988                         RT_TOS(iph->tos), protocol, mark, flow_flags);
 989        rt = __ip_route_output_key(net, &fl4);
 990        if (!IS_ERR(rt)) {
 991                __ip_do_redirect(rt, skb, &fl4, false);
 992                ip_rt_put(rt);
 993        }
 994}
 995EXPORT_SYMBOL_GPL(ipv4_redirect);
 996
 997void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
 998{
 999        const struct iphdr *iph = (const struct iphdr *) skb->data;
1000        struct flowi4 fl4;
1001        struct rtable *rt;
1002
1003        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1004        rt = __ip_route_output_key(sock_net(sk), &fl4);
1005        if (!IS_ERR(rt)) {
1006                __ip_do_redirect(rt, skb, &fl4, false);
1007                ip_rt_put(rt);
1008        }
1009}
1010EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1011
1012static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1013{
1014        struct rtable *rt = (struct rtable *) dst;
1015
1016        /* All IPV4 dsts are created with ->obsolete set to the value
1017         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1018         * into this function always.
1019         *
1020         * When a PMTU/redirect information update invalidates a
1021         * route, this is indicated by setting obsolete to
1022         * DST_OBSOLETE_KILL.
1023         */
1024        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1025                return NULL;
1026        return dst;
1027}
1028
1029static void ipv4_link_failure(struct sk_buff *skb)
1030{
1031        struct rtable *rt;
1032
1033        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034
1035        rt = skb_rtable(skb);
1036        if (rt)
1037                dst_set_expires(&rt->dst, 0);
1038}
1039
1040static int ip_rt_bug(struct sk_buff *skb)
1041{
1042        pr_debug("%s: %pI4 -> %pI4, %s\n",
1043                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044                 skb->dev ? skb->dev->name : "?");
1045        kfree_skb(skb);
1046        WARN_ON(1);
1047        return 0;
1048}
1049
1050/*
1051   We do not cache source address of outgoing interface,
1052   because it is used only by IP RR, TS and SRR options,
1053   so that it out of fast path.
1054
1055   BTW remember: "addr" is allowed to be not aligned
1056   in IP options!
1057 */
1058
/* Write into @addr the source address to record for IP options
 * (RR, TS, SRR) for the packet @skb routed by @rt.
 *
 * For output routes the packet's own source address is used.  For input
 * routes a reverse FIB lookup finds the preferred source address; if no
 * route exists, any universe-scope address on the device is used.
 * @addr may be unaligned (it points into the IP options area), hence
 * the memcpy at the end.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Build a lookup key from the packet's own addresses. */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			/* No route found: fall back to an address on the
			 * device with universe scope.
			 */
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1091
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge @tag into the route's tclassid, filling only the 16-bit halves
 * that are still unset so earlier (more specific) tags win.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 have = rt->dst.tclassid;

	if (!(have & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(have & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1101
1102static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1103{
1104        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1105
1106        if (advmss == 0) {
1107                advmss = max_t(unsigned int, dst->dev->mtu - 40,
1108                               ip_rt_min_advmss);
1109                if (advmss > 65535 - 40)
1110                        advmss = 65535 - 40;
1111        }
1112        return advmss;
1113}
1114
1115static unsigned int ipv4_mtu(const struct dst_entry *dst)
1116{
1117        const struct rtable *rt = (const struct rtable *) dst;
1118        unsigned int mtu = rt->rt_pmtu;
1119
1120        if (mtu && time_after_eq(jiffies, rt->dst.expires))
1121                mtu = 0;
1122
1123        if (!mtu)
1124                mtu = dst_metric_raw(dst, RTAX_MTU);
1125
1126        if (mtu && rt_is_output_route(rt))
1127                return mtu;
1128
1129        mtu = dst->dev->mtu;
1130
1131        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1132                if (rt->rt_uses_gateway && mtu > 576)
1133                        mtu = 576;
1134        }
1135
1136        if (mtu > IP_MAX_MTU)
1137                mtu = IP_MAX_MTU;
1138
1139        return mtu;
1140}
1141
1142static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1143{
1144        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1145        struct fib_nh_exception *fnhe;
1146        u32 hval;
1147
1148        if (!hash)
1149                return NULL;
1150
1151        hval = fnhe_hashfun(daddr);
1152
1153        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1154             fnhe = rcu_dereference(fnhe->fnhe_next)) {
1155                if (fnhe->fnhe_daddr == daddr)
1156                        return fnhe;
1157        }
1158        return NULL;
1159}
1160
/* Bind route @rt to nexthop exception @fnhe: apply the redirect gateway
 * and/or learned PMTU stored in the exception to the route, and cache
 * @rt in the exception for later lookups.
 *
 * Returns true if @rt was bound (the exception matched @daddr and now
 * caches it), false otherwise.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	/* fnhe_lock serializes all writers of exception state. */
	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
		/* A cached route from an old generation means the data
		 * learned with it (gateway, PMTU) is stale too.
		 */
		if (orig && rt_is_expired(orig)) {
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
		}
		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			unsigned long diff = expires - jiffies;

			/* Only honour a PMTU that has not yet expired. */
			if (time_before(jiffies, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				dst_set_expires(&rt->dst, diff);
			}
		}
		if (fnhe->fnhe_gw) {
			rt->rt_flags |= RTCF_REDIRECTED;
			rt->rt_gateway = fnhe->fnhe_gw;
			rt->rt_uses_gateway = 1;
		} else if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		/* Publish @rt as the exception's cached route and free
		 * the previous one, if any.
		 */
		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1202
/* Try to install @rt as the cached route of nexthop @nh: the single
 * per-nexthop slot for input routes, the per-cpu slot for output routes.
 *
 * Returns true if @rt is now cached (the previous occupant, if any, is
 * freed), false if a concurrent updater won the cmpxchg race — in that
 * case the caller must treat @rt as uncached.
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* Lockless publish: succeed only if the slot still holds orig. */
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}
1224
/* Global list of DST_NOCACHE routes (routes not stored in any nexthop);
 * rt_flush_dev() walks it when a device goes away.  Protected by
 * rt_uncached_lock.
 */
static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

/* Register @rt on the uncached list so it can be found at device
 * teardown time.
 */
static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}
1234
1235static void ipv4_dst_destroy(struct dst_entry *dst)
1236{
1237        struct rtable *rt = (struct rtable *) dst;
1238
1239        if (!list_empty(&rt->rt_uncached)) {
1240                spin_lock_bh(&rt_uncached_lock);
1241                list_del(&rt->rt_uncached);
1242                spin_unlock_bh(&rt_uncached_lock);
1243        }
1244}
1245
/* Device @dev is being unregistered: re-point every uncached route that
 * still references it at the netns loopback device, transferring the
 * device reference so @dev can actually go away.
 */
void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			/* Take the reference on the replacement device
			 * before dropping the one held on @dev.
			 */
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}
1263
1264static bool rt_cache_valid(const struct rtable *rt)
1265{
1266        return  rt &&
1267                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1268                !rt_is_expired(rt);
1269}
1270
/* Finish constructing route @rt from FIB lookup result @res: record the
 * gateway, attach the fib_info metrics, and try to cache the route —
 * in the nexthop exception @fnhe if one applies, otherwise in the
 * nexthop itself.  Routes that end up uncached go on the uncached list.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Only link-scope nexthops are real gateways. */
		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		/* No fib_info: the route can never be cached. */
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1314
1315static struct rtable *rt_dst_alloc(struct net_device *dev,
1316                                   bool nopolicy, bool noxfrm, bool will_cache)
1317{
1318        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1319                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1320                         (nopolicy ? DST_NOPOLICY : 0) |
1321                         (noxfrm ? DST_NOXFRM : 0));
1322}
1323
/* called in rcu_read_lock() section */
/* Build and attach an input route for a multicast packet.
 *
 * @our: non-zero if this host is a member of the destination group, in
 *       which case the packet is (also) delivered locally.
 *
 * Returns 0 on success (route attached via skb_dst_set()) or a
 * negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Multicast/broadcast source addresses are never legitimate. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* A 0.0.0.0 source is acceptable only for link-local
		 * multicast (hosts still acquiring an address).
		 */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* A multicast input route must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input= 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-local multicast with forwarding enabled goes to mrouted. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1395
1396
1397static void ip_handle_martian_source(struct net_device *dev,
1398                                     struct in_device *in_dev,
1399                                     struct sk_buff *skb,
1400                                     __be32 daddr,
1401                                     __be32 saddr)
1402{
1403        RT_CACHE_STAT_INC(in_martian_src);
1404#ifdef CONFIG_IP_ROUTE_VERBOSE
1405        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1406                /*
1407                 *      RFC1812 recommendation, if source is martian,
1408                 *      the only hint is MAC header.
1409                 */
1410                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1411                        &daddr, &saddr, dev->name);
1412                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1413                        print_hex_dump(KERN_WARNING, "ll header: ",
1414                                       DUMP_PREFIX_OFFSET, 16, 1,
1415                                       skb_mac_header(skb),
1416                                       dev->hard_header_len, true);
1417                }
1418        }
1419#endif
1420}
1421
/* called in rcu_read_lock() section */
/* Build (or reuse from the nexthop cache) a forwarding route for @skb
 * based on FIB result @res, and attach it to the skb.
 *
 * Returns 0 on success or a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	/* Packets going back out the interface they arrived on may
	 * warrant an ICMP redirect; such routes must not be cached.
	 */
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	if (do_cache) {
		/* Fast path: reuse a still-valid cached input route. */
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1510
/* Select a nexthop (multipath-aware) for a forwarded packet and build
 * the corresponding input route via __mkroute_input().
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1525
1526/*
1527 *      NOTE. We drop all the packets that has local source
1528 *      addresses, because every properly looped back packet
1529 *      must have correct destination already attached by output routine.
1530 *
1531 *      Such approach solves two big problems:
1532 *      1. Not simplex devices are handled properly.
1533 *      2. IP spoofing attempts are filtered with 100% of guarantee.
1534 *      called with rcu_read_lock()
1535 */
1536
/* Main input route resolver for unicast/broadcast/local packets
 * (multicast is handled by ip_route_input_mc()).  Validates source and
 * destination addresses, performs the FIB lookup, and attaches the
 * resulting route to @skb.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* Loopback addresses seen on the wire are martian, unless the
	 * device is explicitly configured to route the localnet range.
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
		if (ipv4_is_loopback(daddr))
			goto martian_destination;

		if (ipv4_is_loopback(saddr))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Forwarding case. */
	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Local delivery: try to reuse the route cached in the nexthop
	 * first; otherwise build one and (when possible) cache it.
	 */
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (res.type == RTN_UNREACHABLE) {
		/* Unreachable destination: deliver the error via
		 * ip_error() instead of local delivery.
		 */
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	if (do_cache)
		rt_cache_route(&FIB_RES_NH(res), rth);
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
1710
1711int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1712                         u8 tos, struct net_device *dev)
1713{
1714        int res;
1715
1716        rcu_read_lock();
1717
1718        /* Multicast recognition logic is moved from route cache to here.
1719           The problem was that too many Ethernet cards have broken/missing
1720           hardware multicast filters :-( As result the host on multicasting
1721           network acquires a lot of useless route cache entries, sort of
1722           SDR messages from all the world. Now we try to get rid of them.
1723           Really, provided software IP multicast filter is organized
1724           reasonably (at least, hashed), it does not result in a slowdown
1725           comparing with route cache reject entries.
1726           Note, that multicast routers are not affected, because
1727           route cache entry is created eventually.
1728         */
1729        if (ipv4_is_multicast(daddr)) {
1730                struct in_device *in_dev = __in_dev_get_rcu(dev);
1731
1732                if (in_dev) {
1733                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1734                                                  ip_hdr(skb)->protocol);
1735                        if (our
1736#ifdef CONFIG_IP_MROUTE
1737                                ||
1738                            (!ipv4_is_local_multicast(daddr) &&
1739                             IN_DEV_MFORWARD(in_dev))
1740#endif
1741                           ) {
1742                                int res = ip_route_input_mc(skb, daddr, saddr,
1743                                                            tos, dev, our);
1744                                rcu_read_unlock();
1745                                return res;
1746                        }
1747                }
1748                rcu_read_unlock();
1749                return -EINVAL;
1750        }
1751        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1752        rcu_read_unlock();
1753        return res;
1754}
1755EXPORT_SYMBOL(ip_route_input_noref);
1756
/*
 * Build an output rtable for a resolved FIB result.
 * Called with rcu_read_lock() held.
 *
 * @res:      FIB lookup result for fl4->daddr
 * @fl4:      flow key the route is being built for
 * @orig_oif: oif originally requested by the caller (stored in rt_iif)
 * @dev_out:  output device chosen by the resolver
 * @flags:    initial RTCF_* flags from the caller
 *
 * Tries to reuse a cached rtable (per-nexthop percpu slot, or a
 * per-destination exception entry) before allocating a new one.
 * Returns the route or an ERR_PTR() on failure.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* A loopback source address may only leave via a loopback device,
	 * unless route_localnet is enabled on the output device. */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	/* Re-classify the route type from the destination address. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;	/* broadcast routes are never gatewayed */
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Not locally joined -> no local delivery for this group. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache = fi != NULL;
	if (fi) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Prefer a per-destination exception (e.g. PMTU/redirect)
		 * over the per-cpu cached output route. */
		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else {
			/* FLOWI_FLAG_KNOWN_NH without a link-scope gateway:
			 * the cached route would not match, build an
			 * uncached one. */
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		/* Reuse the cached route if it is still valid. */
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		/* Locally delivered bcast/mcast leaving a real device also
		 * needs to be looped back to the host: ip_mc_output. */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	/* Record gateway/exception/metrics; may also cache rth. */
	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}
1875
/*
 * Major route resolver routine.
 *
 * Resolve an output route for the flow key @fl4, filling in any
 * unspecified parts of the key (saddr, oif, scope) along the way.
 * Runs its own rcu_read_lock() section and returns an rtable or an
 * ERR_PTR() (-EINVAL, -ENODEV, -ENETUNREACH, ...).
 */

struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	int orig_oif;

	res.tclassid	= 0;
	res.fi		= NULL;
	res.table	= NULL;

	orig_oif = fl4->flowi4_oif;

	/* Output routes are looked up as if arriving on loopback;
	 * RTO_ONLINK in the tos byte forces link scope. */
	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		/* A specified source must be a plausible unicast address. */
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		/* Link-local mcast / limited bcast: route directly out
		 * the requested device, no FIB lookup needed. */
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	/* No destination at all: local route via loopback. */
	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			/* Prefer the route's preferred source if set. */
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	/* Several default routes: pick one (only when oif is free). */
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
	rcu_read_unlock();
	return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2060
/* Blackhole dsts are never revalidated: always report them as stale. */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2065
2066static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2067{
2068        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2069
2070        return mtu ? : dst->dev->mtu;
2071}
2072
/* PMTU updates are deliberately ignored on blackhole routes. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}
2077
/* ICMP redirects are deliberately ignored on blackhole routes. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2082
/* Blackhole routes never get writable metrics. */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2088
/*
 * dst_ops for "blackhole" routes handed out by ipv4_blackhole_route():
 * every mutating callback is a no-op and ->check always invalidates,
 * so these dsts silently absorb updates while a flow is wound down.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2100
/*
 * ipv4_blackhole_route - clone @dst_orig into a discard-everything route
 * @net:      namespace the route belongs to
 * @dst_orig: route to copy attributes from (its reference is consumed)
 *
 * Builds an rtable using ipv4_dst_blackhole_ops whose input/output
 * handlers drop every packet, copying the routing attributes of the
 * original.  Always releases @dst_orig; returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* All traffic through this route is silently dropped. */
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Mirror the original route's identity and attributes. */
		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);

		/* NOTE(review): dst_free() here appears to mark the new dst
		 * for release once the caller's reference (taken by
		 * dst_alloc above) is dropped — confirm against dst.c. */
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2137
2138struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2139                                    struct sock *sk)
2140{
2141        struct rtable *rt = __ip_route_output_key(net, flp4);
2142
2143        if (IS_ERR(rt))
2144                return rt;
2145
2146        if (flp4->flowi4_proto)
2147                rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2148                                                   flowi4_to_flowi(flp4),
2149                                                   sk, 0);
2150
2151        return rt;
2152}
2153EXPORT_SYMBOL_GPL(ip_route_output_flow);
2154
/*
 * rt_fill_info - fill a netlink RTM_NEWROUTE message for a route
 * @net:  namespace (unused directly here, kept for symmetry with callers)
 * @dst:  destination address to report
 * @src:  source address to report (0 = omit RTA_SRC)
 * @fl4:  flow key the route was resolved with
 * @skb:  message buffer; its attached rtable is the route being dumped
 * @pid, @seq, @event, @nowait, @flags: netlink message header fields
 *
 * Returns the value of nlmsg_end() on success or -EMSGSIZE when the
 * message does not fit.
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* Output routes report the resolved preferred source when it
	 * differs from the requested one. */
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Convert an absolute expiry (jiffies) into remaining time. */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	/* An unexpired learned PMTU overrides the MTU metric. */
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
			goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2245
/*
 * inet_rtm_getroute - handle an RTM_GETROUTE netlink request
 * @in_skb: request message
 * @nlh:    its netlink header
 * @arg:    unused
 *
 * Resolves the route described by the request's RTA_* attributes —
 * via the input path when RTA_IIF is given, via the output path
 * otherwise — then replies to the sender with an RTM_NEWROUTE message
 * built by rt_fill_info().  Returns 0 or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		/* Input-direction query: simulate reception on @iif. */
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* A "successful" lookup may still yield an error route
		 * (e.g. unreachable); surface its dst.error. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2342
/* Route-cache dump stub: there is no cache to dump, report "done". */
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
	return skb->len;
}
2347
/* Multicast membership changed on @in_dev: flush cached routes. */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
2352
2353#ifdef CONFIG_SYSCTL
2354static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2355                                        void __user *buffer,
2356                                        size_t *lenp, loff_t *ppos)
2357{
2358        if (write) {
2359                rt_cache_flush((struct net *)__ctl->extra1);
2360                return 0;
2361        }
2362
2363        return -EINVAL;
2364}
2365
/*
 * Global (init_net only) route sysctls under /proc/sys/net/ipv4/route/.
 * Each entry exposes one of the ip_rt_* tunables; jiffies-valued
 * tunables use the *_jiffies handlers so userspace works in seconds
 * (or milliseconds for the _ms variant).
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, millisecond granularity. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* sentinel */
};
2476
/*
 * Per-netns "flush" sysctl; duplicated per namespace in
 * sysctl_route_net_init() with ->extra1 pointing at the owning net.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,	/* write-only */
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },	/* sentinel */
};
2486
/*
 * Per-netns setup: register the route "flush" sysctl for @net.
 * Non-init namespaces get their own kmemdup'd copy of the table so
 * each can carry its own ->extra1 back-pointer.  Returns 0 or -ENOMEM.
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	/* Let the flush handler find the namespace it belongs to. */
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	/* Only free the duplicated copy, never the static template. */
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
2510
/*
 * Per-netns teardown: unregister the sysctl header and free the
 * per-namespace table copy.  init_net uses the static table and is
 * never torn down, hence the BUG_ON.
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
2520
/* Pernet hooks for the per-namespace route sysctls. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
2525#endif
2526
/*
 * Per-netns init of route generation state: reset the genid counter
 * (bumped to invalidate cached routes) and randomize the per-device
 * address generation seed.
 */
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->rt_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
2534
/* Pernet hook: initialize route genid state for each new namespace. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
2538
2539static int __net_init ipv4_inetpeer_init(struct net *net)
2540{
2541        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2542
2543        if (!bp)
2544                return -ENOMEM;
2545        inet_peer_base_init(bp);
2546        net->ipv4.peers = bp;
2547        return 0;
2548}
2549
/*
 * Per-netns teardown: detach, invalidate and free the namespace's
 * inet_peer base.
 */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
2558
/* Pernet hooks for the per-namespace inet_peer base. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
2563
2564#ifdef CONFIG_IP_ROUTE_CLASSID
2565struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2566#endif /* CONFIG_IP_ROUTE_CLASSID */
2567
/*
 * ip_rt_init - boot-time initialization of the IPv4 routing layer
 *
 * Sets up the dst slab caches and counters, initializes devinet and
 * the FIB, creates proc files, hooks up xfrm, registers the
 * RTM_GETROUTE handler and the pernet subsystems.  Unrecoverable
 * allocation failures panic; always returns 0 otherwise.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* Per-cpu accounting buckets for route classids. */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole routes share the same slab. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Effectively disable dst garbage-collection thresholds. */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
2611
2612#ifdef CONFIG_SYSCTL
2613/*
2614 * We really need to sanitize the damn ipv4 init order, then all
2615 * this nonsense will go away.
2616 */
/* Register the static (init_net) route sysctl table early in boot. */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
2621#endif
2622