linux/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       Verify area fixes.
  16 *              Alan Cox        :       cli() protects routing changes
  17 *              Rui Oliveira    :       ICMP routing table updates
  18 *              (rco@di.uminho.pt)      Routing table insertion and update
  19 *              Linus Torvalds  :       Rewrote bits to be sensible
  20 *              Alan Cox        :       Added BSD route gw semantics
  21 *              Alan Cox        :       Super /proc >4K
  22 *              Alan Cox        :       MTU in route table
  23 *              Alan Cox        :       MSS actually. Also added the window
  24 *                                      clamper.
  25 *              Sam Lantinga    :       Fixed route matching in rt_del()
  26 *              Alan Cox        :       Routing cache support.
  27 *              Alan Cox        :       Removed compatibility cruft.
  28 *              Alan Cox        :       RTF_REJECT support.
  29 *              Alan Cox        :       TCP irtt support.
  30 *              Jonathan Naylor :       Added Metric support.
  31 *      Miquel van Smoorenburg  :       BSD API fixes.
  32 *      Miquel van Smoorenburg  :       Metrics.
  33 *              Alan Cox        :       Use __u32 properly
  34 *              Alan Cox        :       Aligned routing errors more closely with BSD
  35 *                                      our system is still very different.
  36 *              Alan Cox        :       Faster /proc handling
  37 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38 *                                      routing caches and better behaviour.
  39 *
  40 *              Olaf Erb        :       irtt wasn't being copied right.
  41 *              Bjorn Ekwall    :       Kerneld route support.
  42 *              Alan Cox        :       Multicast fixed (I hope)
  43 *              Pavel Krauz     :       Limited broadcast fixed
  44 *              Mike McLagan    :       Routing by source
  45 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46 *                                      route.c and rewritten from scratch.
  47 *              Andi Kleen      :       Load-limit warning messages.
  48 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52 *              Marc Boucher    :       routing by fwmark
  53 *      Robert Olsson           :       Added rt_cache statistics
  54 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58 *
  59 *              This program is free software; you can redistribute it and/or
  60 *              modify it under the terms of the GNU General Public License
  61 *              as published by the Free Software Foundation; either version
  62 *              2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
 111
 112#define RT_FL_TOS(oldflp4) \
 113        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 114
 115#define IP_MAX_MTU      0xFFF0
 116
 117#define RT_GC_TIMEOUT (300*HZ)
 118
 119static int ip_rt_max_size;
 120static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 121static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 122static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 123static int ip_rt_redirect_number __read_mostly  = 9;
 124static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126static int ip_rt_error_cost __read_mostly       = HZ;
 127static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128static int ip_rt_gc_elasticity __read_mostly    = 8;
 129static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131static int ip_rt_min_advmss __read_mostly       = 256;
 132
 133/*
 134 *      Interface to generic destination cache.
 135 */
 136
 137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 138static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 139static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 140static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 141static void              ipv4_link_failure(struct sk_buff *skb);
 142static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 143                                           struct sk_buff *skb, u32 mtu);
 144static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145                                        struct sk_buff *skb);
 146static void             ipv4_dst_destroy(struct dst_entry *dst);
 147
 148static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 149                            int how)
 150{
 151}
 152
 153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154{
 155        WARN_ON(1);
 156        return NULL;
 157}
 158
 159static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 160                                           struct sk_buff *skb,
 161                                           const void *daddr);
 162
 163static struct dst_ops ipv4_dst_ops = {
 164        .family =               AF_INET,
 165        .protocol =             cpu_to_be16(ETH_P_IP),
 166        .check =                ipv4_dst_check,
 167        .default_advmss =       ipv4_default_advmss,
 168        .mtu =                  ipv4_mtu,
 169        .cow_metrics =          ipv4_cow_metrics,
 170        .destroy =              ipv4_dst_destroy,
 171        .ifdown =               ipv4_dst_ifdown,
 172        .negative_advice =      ipv4_negative_advice,
 173        .link_failure =         ipv4_link_failure,
 174        .update_pmtu =          ip_rt_update_pmtu,
 175        .redirect =             ip_do_redirect,
 176        .local_out =            __ip_local_out,
 177        .neigh_lookup =         ipv4_neigh_lookup,
 178};
 179
 180#define ECN_OR_COST(class)      TC_PRIO_##class
 181
 182const __u8 ip_tos2prio[16] = {
 183        TC_PRIO_BESTEFFORT,
 184        ECN_OR_COST(BESTEFFORT),
 185        TC_PRIO_BESTEFFORT,
 186        ECN_OR_COST(BESTEFFORT),
 187        TC_PRIO_BULK,
 188        ECN_OR_COST(BULK),
 189        TC_PRIO_BULK,
 190        ECN_OR_COST(BULK),
 191        TC_PRIO_INTERACTIVE,
 192        ECN_OR_COST(INTERACTIVE),
 193        TC_PRIO_INTERACTIVE,
 194        ECN_OR_COST(INTERACTIVE),
 195        TC_PRIO_INTERACTIVE_BULK,
 196        ECN_OR_COST(INTERACTIVE_BULK),
 197        TC_PRIO_INTERACTIVE_BULK,
 198        ECN_OR_COST(INTERACTIVE_BULK)
 199};
 200EXPORT_SYMBOL(ip_tos2prio);
 201
 202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 203#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 204
 205#ifdef CONFIG_PROC_FS
 206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 207{
 208        if (*pos)
 209                return NULL;
 210        return SEQ_START_TOKEN;
 211}
 212
 213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 214{
 215        ++*pos;
 216        return NULL;
 217}
 218
 219static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 220{
 221}
 222
 223static int rt_cache_seq_show(struct seq_file *seq, void *v)
 224{
 225        if (v == SEQ_START_TOKEN)
 226                seq_printf(seq, "%-127s\n",
 227                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 228                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 229                           "HHUptod\tSpecDst");
 230        return 0;
 231}
 232
 233static const struct seq_operations rt_cache_seq_ops = {
 234        .start  = rt_cache_seq_start,
 235        .next   = rt_cache_seq_next,
 236        .stop   = rt_cache_seq_stop,
 237        .show   = rt_cache_seq_show,
 238};
 239
 240static int rt_cache_seq_open(struct inode *inode, struct file *file)
 241{
 242        return seq_open(file, &rt_cache_seq_ops);
 243}
 244
 245static const struct file_operations rt_cache_seq_fops = {
 246        .owner   = THIS_MODULE,
 247        .open    = rt_cache_seq_open,
 248        .read    = seq_read,
 249        .llseek  = seq_lseek,
 250        .release = seq_release,
 251};
 252
 253
 254static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 255{
 256        int cpu;
 257
 258        if (*pos == 0)
 259                return SEQ_START_TOKEN;
 260
 261        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 262                if (!cpu_possible(cpu))
 263                        continue;
 264                *pos = cpu+1;
 265                return &per_cpu(rt_cache_stat, cpu);
 266        }
 267        return NULL;
 268}
 269
 270static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 271{
 272        int cpu;
 273
 274        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 275                if (!cpu_possible(cpu))
 276                        continue;
 277                *pos = cpu+1;
 278                return &per_cpu(rt_cache_stat, cpu);
 279        }
 280        return NULL;
 281
 282}
 283
 284static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 285{
 286
 287}
 288
 289static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 290{
 291        struct rt_cache_stat *st = v;
 292
 293        if (v == SEQ_START_TOKEN) {
 294                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 295                return 0;
 296        }
 297
 298        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 299                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 300                   dst_entries_get_slow(&ipv4_dst_ops),
 301                   st->in_hit,
 302                   st->in_slow_tot,
 303                   st->in_slow_mc,
 304                   st->in_no_route,
 305                   st->in_brd,
 306                   st->in_martian_dst,
 307                   st->in_martian_src,
 308
 309                   st->out_hit,
 310                   st->out_slow_tot,
 311                   st->out_slow_mc,
 312
 313                   st->gc_total,
 314                   st->gc_ignored,
 315                   st->gc_goal_miss,
 316                   st->gc_dst_overflow,
 317                   st->in_hlist_search,
 318                   st->out_hlist_search
 319                );
 320        return 0;
 321}
 322
 323static const struct seq_operations rt_cpu_seq_ops = {
 324        .start  = rt_cpu_seq_start,
 325        .next   = rt_cpu_seq_next,
 326        .stop   = rt_cpu_seq_stop,
 327        .show   = rt_cpu_seq_show,
 328};
 329
 330
 331static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 332{
 333        return seq_open(file, &rt_cpu_seq_ops);
 334}
 335
 336static const struct file_operations rt_cpu_seq_fops = {
 337        .owner   = THIS_MODULE,
 338        .open    = rt_cpu_seq_open,
 339        .read    = seq_read,
 340        .llseek  = seq_lseek,
 341        .release = seq_release,
 342};
 343
 344#ifdef CONFIG_IP_ROUTE_CLASSID
 345static int rt_acct_proc_show(struct seq_file *m, void *v)
 346{
 347        struct ip_rt_acct *dst, *src;
 348        unsigned int i, j;
 349
 350        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 351        if (!dst)
 352                return -ENOMEM;
 353
 354        for_each_possible_cpu(i) {
 355                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 356                for (j = 0; j < 256; j++) {
 357                        dst[j].o_bytes   += src[j].o_bytes;
 358                        dst[j].o_packets += src[j].o_packets;
 359                        dst[j].i_bytes   += src[j].i_bytes;
 360                        dst[j].i_packets += src[j].i_packets;
 361                }
 362        }
 363
 364        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 365        kfree(dst);
 366        return 0;
 367}
 368
 369static int rt_acct_proc_open(struct inode *inode, struct file *file)
 370{
 371        return single_open(file, rt_acct_proc_show, NULL);
 372}
 373
 374static const struct file_operations rt_acct_proc_fops = {
 375        .owner          = THIS_MODULE,
 376        .open           = rt_acct_proc_open,
 377        .read           = seq_read,
 378        .llseek         = seq_lseek,
 379        .release        = single_release,
 380};
 381#endif
 382
 383static int __net_init ip_rt_do_proc_init(struct net *net)
 384{
 385        struct proc_dir_entry *pde;
 386
 387        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 388                        &rt_cache_seq_fops);
 389        if (!pde)
 390                goto err1;
 391
 392        pde = proc_create("rt_cache", S_IRUGO,
 393                          net->proc_net_stat, &rt_cpu_seq_fops);
 394        if (!pde)
 395                goto err2;
 396
 397#ifdef CONFIG_IP_ROUTE_CLASSID
 398        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 399        if (!pde)
 400                goto err3;
 401#endif
 402        return 0;
 403
 404#ifdef CONFIG_IP_ROUTE_CLASSID
 405err3:
 406        remove_proc_entry("rt_cache", net->proc_net_stat);
 407#endif
 408err2:
 409        remove_proc_entry("rt_cache", net->proc_net);
 410err1:
 411        return -ENOMEM;
 412}
 413
 414static void __net_exit ip_rt_do_proc_exit(struct net *net)
 415{
 416        remove_proc_entry("rt_cache", net->proc_net_stat);
 417        remove_proc_entry("rt_cache", net->proc_net);
 418#ifdef CONFIG_IP_ROUTE_CLASSID
 419        remove_proc_entry("rt_acct", net->proc_net);
 420#endif
 421}
 422
 423static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 424        .init = ip_rt_do_proc_init,
 425        .exit = ip_rt_do_proc_exit,
 426};
 427
 428static int __init ip_rt_proc_init(void)
 429{
 430        return register_pernet_subsys(&ip_rt_proc_ops);
 431}
 432
 433#else
 434static inline int ip_rt_proc_init(void)
 435{
 436        return 0;
 437}
 438#endif /* CONFIG_PROC_FS */
 439
 440static inline bool rt_is_expired(const struct rtable *rth)
 441{
 442        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 443}
 444
 445void rt_cache_flush(struct net *net)
 446{
 447        rt_genid_bump(net);
 448}
 449
 450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 451                                           struct sk_buff *skb,
 452                                           const void *daddr)
 453{
 454        struct net_device *dev = dst->dev;
 455        const __be32 *pkey = daddr;
 456        const struct rtable *rt;
 457        struct neighbour *n;
 458
 459        rt = (const struct rtable *) dst;
 460        if (rt->rt_gateway)
 461                pkey = (const __be32 *) &rt->rt_gateway;
 462        else if (skb)
 463                pkey = &ip_hdr(skb)->daddr;
 464
 465        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 466        if (n)
 467                return n;
 468        return neigh_create(&arp_tbl, pkey, dev);
 469}
 470
 471/*
 472 * Peer allocation may fail only in serious out-of-memory conditions.  However
 473 * we still can generate some output.
 474 * Random ID selection looks a bit dangerous because we have no chances to
 475 * select ID being unique in a reasonable period of time.
 476 * But broken packet identifier may be better than no packet at all.
 477 */
 478static void ip_select_fb_ident(struct iphdr *iph)
 479{
 480        static DEFINE_SPINLOCK(ip_fb_id_lock);
 481        static u32 ip_fallback_id;
 482        u32 salt;
 483
 484        spin_lock_bh(&ip_fb_id_lock);
 485        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
 486        iph->id = htons(salt & 0xFFFF);
 487        ip_fallback_id = salt;
 488        spin_unlock_bh(&ip_fb_id_lock);
 489}
 490
 491void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 492{
 493        struct net *net = dev_net(dst->dev);
 494        struct inet_peer *peer;
 495
 496        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
 497        if (peer) {
 498                iph->id = htons(inet_getid(peer, more));
 499                inet_putpeer(peer);
 500                return;
 501        }
 502
 503        ip_select_fb_ident(iph);
 504}
 505EXPORT_SYMBOL(__ip_select_ident);
 506
 507static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 508                             const struct iphdr *iph,
 509                             int oif, u8 tos,
 510                             u8 prot, u32 mark, int flow_flags)
 511{
 512        if (sk) {
 513                const struct inet_sock *inet = inet_sk(sk);
 514
 515                oif = sk->sk_bound_dev_if;
 516                mark = sk->sk_mark;
 517                tos = RT_CONN_FLAGS(sk);
 518                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 519        }
 520        flowi4_init_output(fl4, oif, mark, tos,
 521                           RT_SCOPE_UNIVERSE, prot,
 522                           flow_flags,
 523                           iph->daddr, iph->saddr, 0, 0);
 524}
 525
 526static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 527                               const struct sock *sk)
 528{
 529        const struct iphdr *iph = ip_hdr(skb);
 530        int oif = skb->dev->ifindex;
 531        u8 tos = RT_TOS(iph->tos);
 532        u8 prot = iph->protocol;
 533        u32 mark = skb->mark;
 534
 535        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 536}
 537
 538static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 539{
 540        const struct inet_sock *inet = inet_sk(sk);
 541        const struct ip_options_rcu *inet_opt;
 542        __be32 daddr = inet->inet_daddr;
 543
 544        rcu_read_lock();
 545        inet_opt = rcu_dereference(inet->inet_opt);
 546        if (inet_opt && inet_opt->opt.srr)
 547                daddr = inet_opt->opt.faddr;
 548        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 549                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 550                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 551                           inet_sk_flowi_flags(sk),
 552                           daddr, inet->inet_saddr, 0, 0);
 553        rcu_read_unlock();
 554}
 555
 556static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 557                                 const struct sk_buff *skb)
 558{
 559        if (skb)
 560                build_skb_flow_key(fl4, skb, sk);
 561        else
 562                build_sk_flow_key(fl4, sk);
 563}
 564
 565static inline void rt_free(struct rtable *rt)
 566{
 567        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 568}
 569
 570static DEFINE_SPINLOCK(fnhe_lock);
 571
 572static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 573{
 574        struct fib_nh_exception *fnhe, *oldest;
 575        struct rtable *orig;
 576
 577        oldest = rcu_dereference(hash->chain);
 578        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 579             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 580                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 581                        oldest = fnhe;
 582        }
 583        orig = rcu_dereference(oldest->fnhe_rth);
 584        if (orig) {
 585                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
 586                rt_free(orig);
 587        }
 588        return oldest;
 589}
 590
 591static inline u32 fnhe_hashfun(__be32 daddr)
 592{
 593        u32 hval;
 594
 595        hval = (__force u32) daddr;
 596        hval ^= (hval >> 11) ^ (hval >> 22);
 597
 598        return hval & (FNHE_HASH_SIZE - 1);
 599}
 600
 601static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 602                                  u32 pmtu, unsigned long expires)
 603{
 604        struct fnhe_hash_bucket *hash;
 605        struct fib_nh_exception *fnhe;
 606        int depth;
 607        u32 hval = fnhe_hashfun(daddr);
 608
 609        spin_lock_bh(&fnhe_lock);
 610
 611        hash = nh->nh_exceptions;
 612        if (!hash) {
 613                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 614                if (!hash)
 615                        goto out_unlock;
 616                nh->nh_exceptions = hash;
 617        }
 618
 619        hash += hval;
 620
 621        depth = 0;
 622        for (fnhe = rcu_dereference(hash->chain); fnhe;
 623             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 624                if (fnhe->fnhe_daddr == daddr)
 625                        break;
 626                depth++;
 627        }
 628
 629        if (fnhe) {
 630                if (gw)
 631                        fnhe->fnhe_gw = gw;
 632                if (pmtu) {
 633                        fnhe->fnhe_pmtu = pmtu;
 634                        fnhe->fnhe_expires = expires;
 635                }
 636        } else {
 637                if (depth > FNHE_RECLAIM_DEPTH)
 638                        fnhe = fnhe_oldest(hash);
 639                else {
 640                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 641                        if (!fnhe)
 642                                goto out_unlock;
 643
 644                        fnhe->fnhe_next = hash->chain;
 645                        rcu_assign_pointer(hash->chain, fnhe);
 646                }
 647                fnhe->fnhe_daddr = daddr;
 648                fnhe->fnhe_gw = gw;
 649                fnhe->fnhe_pmtu = pmtu;
 650                fnhe->fnhe_expires = expires;
 651        }
 652
 653        fnhe->fnhe_stamp = jiffies;
 654
 655out_unlock:
 656        spin_unlock_bh(&fnhe_lock);
 657        return;
 658}
 659
 660static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 661                             bool kill_route)
 662{
 663        __be32 new_gw = icmp_hdr(skb)->un.gateway;
 664        __be32 old_gw = ip_hdr(skb)->saddr;
 665        struct net_device *dev = skb->dev;
 666        struct in_device *in_dev;
 667        struct fib_result res;
 668        struct neighbour *n;
 669        struct net *net;
 670
 671        switch (icmp_hdr(skb)->code & 7) {
 672        case ICMP_REDIR_NET:
 673        case ICMP_REDIR_NETTOS:
 674        case ICMP_REDIR_HOST:
 675        case ICMP_REDIR_HOSTTOS:
 676                break;
 677
 678        default:
 679                return;
 680        }
 681
 682        if (rt->rt_gateway != old_gw)
 683                return;
 684
 685        in_dev = __in_dev_get_rcu(dev);
 686        if (!in_dev)
 687                return;
 688
 689        net = dev_net(dev);
 690        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 691            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 692            ipv4_is_zeronet(new_gw))
 693                goto reject_redirect;
 694
 695        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 696                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 697                        goto reject_redirect;
 698                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 699                        goto reject_redirect;
 700        } else {
 701                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 702                        goto reject_redirect;
 703        }
 704
 705        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 706        if (n) {
 707                if (!(n->nud_state & NUD_VALID)) {
 708                        neigh_event_send(n, NULL);
 709                } else {
 710                        if (fib_lookup(net, fl4, &res) == 0) {
 711                                struct fib_nh *nh = &FIB_RES_NH(res);
 712
 713                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
 714                                                      0, 0);
 715                        }
 716                        if (kill_route)
 717                                rt->dst.obsolete = DST_OBSOLETE_KILL;
 718                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 719                }
 720                neigh_release(n);
 721        }
 722        return;
 723
 724reject_redirect:
 725#ifdef CONFIG_IP_ROUTE_VERBOSE
 726        if (IN_DEV_LOG_MARTIANS(in_dev)) {
 727                const struct iphdr *iph = (const struct iphdr *) skb->data;
 728                __be32 daddr = iph->daddr;
 729                __be32 saddr = iph->saddr;
 730
 731                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 732                                     "  Advised path = %pI4 -> %pI4\n",
 733                                     &old_gw, dev->name, &new_gw,
 734                                     &saddr, &daddr);
 735        }
 736#endif
 737        ;
 738}
 739
 740static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 741{
 742        struct rtable *rt;
 743        struct flowi4 fl4;
 744
 745        rt = (struct rtable *) dst;
 746
 747        ip_rt_build_flow_key(&fl4, sk, skb);
 748        __ip_do_redirect(rt, skb, &fl4, true);
 749}
 750
 751static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 752{
 753        struct rtable *rt = (struct rtable *)dst;
 754        struct dst_entry *ret = dst;
 755
 756        if (rt) {
 757                if (dst->obsolete > 0) {
 758                        ip_rt_put(rt);
 759                        ret = NULL;
 760                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 761                           rt->dst.expires) {
 762                        ip_rt_put(rt);
 763                        ret = NULL;
 764                }
 765        }
 766        return ret;
 767}
 768
 769/*
 770 * Algorithm:
 771 *      1. The first ip_rt_redirect_number redirects are sent
 772 *         with exponential backoff, then we stop sending them at all,
 773 *         assuming that the host ignores our redirects.
 774 *      2. If we did not see packets requiring redirects
 775 *         during ip_rt_redirect_silence, we assume that the host
 776 *         forgot redirected route and start to send redirects again.
 777 *
 778 * This algorithm is much cheaper and more intelligent than dumb load limiting
 779 * in icmp.c.
 780 *
 781 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 782 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 783 */
 784
 785void ip_rt_send_redirect(struct sk_buff *skb)
 786{
 787        struct rtable *rt = skb_rtable(skb);
 788        struct in_device *in_dev;
 789        struct inet_peer *peer;
 790        struct net *net;
 791        int log_martians;
 792
 793        rcu_read_lock();
 794        in_dev = __in_dev_get_rcu(rt->dst.dev);
 795        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 796                rcu_read_unlock();
 797                return;
 798        }
 799        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 800        rcu_read_unlock();
 801
 802        net = dev_net(rt->dst.dev);
 803        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 804        if (!peer) {
 805                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 806                          rt_nexthop(rt, ip_hdr(skb)->daddr));
 807                return;
 808        }
 809
 810        /* No redirected packets during ip_rt_redirect_silence;
 811         * reset the algorithm.
 812         */
 813        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 814                peer->rate_tokens = 0;
 815
 816        /* Too many ignored redirects; do not send anything
 817         * set dst.rate_last to the last seen redirected packet.
 818         */
 819        if (peer->rate_tokens >= ip_rt_redirect_number) {
 820                peer->rate_last = jiffies;
 821                goto out_put_peer;
 822        }
 823
 824        /* Check for load limit; set rate_last to the latest sent
 825         * redirect.
 826         */
 827        if (peer->rate_tokens == 0 ||
 828            time_after(jiffies,
 829                       (peer->rate_last +
 830                        (ip_rt_redirect_load << peer->rate_tokens)))) {
 831                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 832
 833                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 834                peer->rate_last = jiffies;
 835                ++peer->rate_tokens;
 836#ifdef CONFIG_IP_ROUTE_VERBOSE
 837                if (log_martians &&
 838                    peer->rate_tokens == ip_rt_redirect_number)
 839                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 840                                             &ip_hdr(skb)->saddr, inet_iif(skb),
 841                                             &ip_hdr(skb)->daddr, &gw);
 842#endif
 843        }
 844out_put_peer:
 845        inet_putpeer(peer);
 846}
 847
 848static int ip_error(struct sk_buff *skb)
 849{
 850        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 851        struct rtable *rt = skb_rtable(skb);
 852        struct inet_peer *peer;
 853        unsigned long now;
 854        struct net *net;
 855        bool send;
 856        int code;
 857
 858        net = dev_net(rt->dst.dev);
 859        if (!IN_DEV_FORWARD(in_dev)) {
 860                switch (rt->dst.error) {
 861                case EHOSTUNREACH:
 862                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 863                        break;
 864
 865                case ENETUNREACH:
 866                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 867                        break;
 868                }
 869                goto out;
 870        }
 871
 872        switch (rt->dst.error) {
 873        case EINVAL:
 874        default:
 875                goto out;
 876        case EHOSTUNREACH:
 877                code = ICMP_HOST_UNREACH;
 878                break;
 879        case ENETUNREACH:
 880                code = ICMP_NET_UNREACH;
 881                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 882                break;
 883        case EACCES:
 884                code = ICMP_PKT_FILTERED;
 885                break;
 886        }
 887
 888        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 889
 890        send = true;
 891        if (peer) {
 892                now = jiffies;
 893                peer->rate_tokens += now - peer->rate_last;
 894                if (peer->rate_tokens > ip_rt_error_burst)
 895                        peer->rate_tokens = ip_rt_error_burst;
 896                peer->rate_last = now;
 897                if (peer->rate_tokens >= ip_rt_error_cost)
 898                        peer->rate_tokens -= ip_rt_error_cost;
 899                else
 900                        send = false;
 901                inet_putpeer(peer);
 902        }
 903        if (send)
 904                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 905
 906out:    kfree_skb(skb);
 907        return 0;
 908}
 909
 910static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 911{
 912        struct dst_entry *dst = &rt->dst;
 913        struct fib_result res;
 914
 915        if (dst->dev->mtu < mtu)
 916                return;
 917
 918        if (mtu < ip_rt_min_pmtu)
 919                mtu = ip_rt_min_pmtu;
 920
 921        if (!rt->rt_pmtu) {
 922                dst->obsolete = DST_OBSOLETE_KILL;
 923        } else {
 924                rt->rt_pmtu = mtu;
 925                dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
 926        }
 927
 928        rcu_read_lock();
 929        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
 930                struct fib_nh *nh = &FIB_RES_NH(res);
 931
 932                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 933                                      jiffies + ip_rt_mtu_expires);
 934        }
 935        rcu_read_unlock();
 936}
 937
 938static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 939                              struct sk_buff *skb, u32 mtu)
 940{
 941        struct rtable *rt = (struct rtable *) dst;
 942        struct flowi4 fl4;
 943
 944        ip_rt_build_flow_key(&fl4, sk, skb);
 945        __ip_rt_update_pmtu(rt, &fl4, mtu);
 946}
 947
 948void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 949                      int oif, u32 mark, u8 protocol, int flow_flags)
 950{
 951        const struct iphdr *iph = (const struct iphdr *) skb->data;
 952        struct flowi4 fl4;
 953        struct rtable *rt;
 954
 955        __build_flow_key(&fl4, NULL, iph, oif,
 956                         RT_TOS(iph->tos), protocol, mark, flow_flags);
 957        rt = __ip_route_output_key(net, &fl4);
 958        if (!IS_ERR(rt)) {
 959                __ip_rt_update_pmtu(rt, &fl4, mtu);
 960                ip_rt_put(rt);
 961        }
 962}
 963EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 964
 965void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 966{
 967        const struct iphdr *iph = (const struct iphdr *) skb->data;
 968        struct flowi4 fl4;
 969        struct rtable *rt;
 970
 971        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 972        rt = __ip_route_output_key(sock_net(sk), &fl4);
 973        if (!IS_ERR(rt)) {
 974                __ip_rt_update_pmtu(rt, &fl4, mtu);
 975                ip_rt_put(rt);
 976        }
 977}
 978EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 979
 980void ipv4_redirect(struct sk_buff *skb, struct net *net,
 981                   int oif, u32 mark, u8 protocol, int flow_flags)
 982{
 983        const struct iphdr *iph = (const struct iphdr *) skb->data;
 984        struct flowi4 fl4;
 985        struct rtable *rt;
 986
 987        __build_flow_key(&fl4, NULL, iph, oif,
 988                         RT_TOS(iph->tos), protocol, mark, flow_flags);
 989        rt = __ip_route_output_key(net, &fl4);
 990        if (!IS_ERR(rt)) {
 991                __ip_do_redirect(rt, skb, &fl4, false);
 992                ip_rt_put(rt);
 993        }
 994}
 995EXPORT_SYMBOL_GPL(ipv4_redirect);
 996
 997void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
 998{
 999        const struct iphdr *iph = (const struct iphdr *) skb->data;
1000        struct flowi4 fl4;
1001        struct rtable *rt;
1002
1003        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1004        rt = __ip_route_output_key(sock_net(sk), &fl4);
1005        if (!IS_ERR(rt)) {
1006                __ip_do_redirect(rt, skb, &fl4, false);
1007                ip_rt_put(rt);
1008        }
1009}
1010EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1011
1012static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1013{
1014        struct rtable *rt = (struct rtable *) dst;
1015
1016        /* All IPV4 dsts are created with ->obsolete set to the value
1017         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1018         * into this function always.
1019         *
1020         * When a PMTU/redirect information update invalidates a
1021         * route, this is indicated by setting obsolete to
1022         * DST_OBSOLETE_KILL.
1023         */
1024        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1025                return NULL;
1026        return dst;
1027}
1028
1029static void ipv4_link_failure(struct sk_buff *skb)
1030{
1031        struct rtable *rt;
1032
1033        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034
1035        rt = skb_rtable(skb);
1036        if (rt)
1037                dst_set_expires(&rt->dst, 0);
1038}
1039
1040static int ip_rt_bug(struct sk_buff *skb)
1041{
1042        pr_debug("%s: %pI4 -> %pI4, %s\n",
1043                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044                 skb->dev ? skb->dev->name : "?");
1045        kfree_skb(skb);
1046        WARN_ON(1);
1047        return 0;
1048}
1049
1050/*
1051   We do not cache source address of outgoing interface,
1052   because it is used only by IP RR, TS and SRR options,
1053   so that it out of fast path.
1054
1055   BTW remember: "addr" is allowed to be not aligned
1056   in IP options!
1057 */
1058
1059void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1060{
1061        __be32 src;
1062
1063        if (rt_is_output_route(rt))
1064                src = ip_hdr(skb)->saddr;
1065        else {
1066                struct fib_result res;
1067                struct flowi4 fl4;
1068                struct iphdr *iph;
1069
1070                iph = ip_hdr(skb);
1071
1072                memset(&fl4, 0, sizeof(fl4));
1073                fl4.daddr = iph->daddr;
1074                fl4.saddr = iph->saddr;
1075                fl4.flowi4_tos = RT_TOS(iph->tos);
1076                fl4.flowi4_oif = rt->dst.dev->ifindex;
1077                fl4.flowi4_iif = skb->dev->ifindex;
1078                fl4.flowi4_mark = skb->mark;
1079
1080                rcu_read_lock();
1081                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1082                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1083                else
1084                        src = inet_select_addr(rt->dst.dev,
1085                                               rt_nexthop(rt, iph->daddr),
1086                                               RT_SCOPE_UNIVERSE);
1087                rcu_read_unlock();
1088        }
1089        memcpy(addr, &src, 4);
1090}
1091
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into rt->dst.tclassid without overwriting what is already
 * there: each 16-bit half of the class id is filled from @tag only if
 * that half is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;
	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1101
1102static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1103{
1104        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1105
1106        if (advmss == 0) {
1107                advmss = max_t(unsigned int, dst->dev->mtu - 40,
1108                               ip_rt_min_advmss);
1109                if (advmss > 65535 - 40)
1110                        advmss = 65535 - 40;
1111        }
1112        return advmss;
1113}
1114
1115static unsigned int ipv4_mtu(const struct dst_entry *dst)
1116{
1117        const struct rtable *rt = (const struct rtable *) dst;
1118        unsigned int mtu = rt->rt_pmtu;
1119
1120        if (mtu && time_after_eq(jiffies, rt->dst.expires))
1121                mtu = 0;
1122
1123        if (!mtu)
1124                mtu = dst_metric_raw(dst, RTAX_MTU);
1125
1126        if (mtu && rt_is_output_route(rt))
1127                return mtu;
1128
1129        mtu = dst->dev->mtu;
1130
1131        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1132                if (rt->rt_uses_gateway && mtu > 576)
1133                        mtu = 576;
1134        }
1135
1136        if (mtu > IP_MAX_MTU)
1137                mtu = IP_MAX_MTU;
1138
1139        return mtu;
1140}
1141
1142static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1143{
1144        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1145        struct fib_nh_exception *fnhe;
1146        u32 hval;
1147
1148        if (!hash)
1149                return NULL;
1150
1151        hval = fnhe_hashfun(daddr);
1152
1153        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1154             fnhe = rcu_dereference(fnhe->fnhe_next)) {
1155                if (fnhe->fnhe_daddr == daddr)
1156                        return fnhe;
1157        }
1158        return NULL;
1159}
1160
1161static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1162                              __be32 daddr)
1163{
1164        bool ret = false;
1165
1166        spin_lock_bh(&fnhe_lock);
1167
1168        if (daddr == fnhe->fnhe_daddr) {
1169                struct rtable *orig;
1170
1171                if (fnhe->fnhe_pmtu) {
1172                        unsigned long expires = fnhe->fnhe_expires;
1173                        unsigned long diff = expires - jiffies;
1174
1175                        if (time_before(jiffies, expires)) {
1176                                rt->rt_pmtu = fnhe->fnhe_pmtu;
1177                                dst_set_expires(&rt->dst, diff);
1178                        }
1179                }
1180                if (fnhe->fnhe_gw) {
1181                        rt->rt_flags |= RTCF_REDIRECTED;
1182                        rt->rt_gateway = fnhe->fnhe_gw;
1183                        rt->rt_uses_gateway = 1;
1184                } else if (!rt->rt_gateway)
1185                        rt->rt_gateway = daddr;
1186
1187                orig = rcu_dereference(fnhe->fnhe_rth);
1188                rcu_assign_pointer(fnhe->fnhe_rth, rt);
1189                if (orig)
1190                        rt_free(orig);
1191
1192                fnhe->fnhe_stamp = jiffies;
1193                ret = true;
1194        }
1195        spin_unlock_bh(&fnhe_lock);
1196
1197        return ret;
1198}
1199
1200static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1201{
1202        struct rtable *orig, *prev, **p;
1203        bool ret = true;
1204
1205        if (rt_is_input_route(rt)) {
1206                p = (struct rtable **)&nh->nh_rth_input;
1207        } else {
1208                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1209        }
1210        orig = *p;
1211
1212        prev = cmpxchg(p, orig, rt);
1213        if (prev == orig) {
1214                if (orig)
1215                        rt_free(orig);
1216        } else
1217                ret = false;
1218
1219        return ret;
1220}
1221
1222static DEFINE_SPINLOCK(rt_uncached_lock);
1223static LIST_HEAD(rt_uncached_list);
1224
1225static void rt_add_uncached_list(struct rtable *rt)
1226{
1227        spin_lock_bh(&rt_uncached_lock);
1228        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1229        spin_unlock_bh(&rt_uncached_lock);
1230}
1231
1232static void ipv4_dst_destroy(struct dst_entry *dst)
1233{
1234        struct rtable *rt = (struct rtable *) dst;
1235
1236        if (!list_empty(&rt->rt_uncached)) {
1237                spin_lock_bh(&rt_uncached_lock);
1238                list_del(&rt->rt_uncached);
1239                spin_unlock_bh(&rt_uncached_lock);
1240        }
1241}
1242
1243void rt_flush_dev(struct net_device *dev)
1244{
1245        if (!list_empty(&rt_uncached_list)) {
1246                struct net *net = dev_net(dev);
1247                struct rtable *rt;
1248
1249                spin_lock_bh(&rt_uncached_lock);
1250                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1251                        if (rt->dst.dev != dev)
1252                                continue;
1253                        rt->dst.dev = net->loopback_dev;
1254                        dev_hold(rt->dst.dev);
1255                        dev_put(dev);
1256                }
1257                spin_unlock_bh(&rt_uncached_lock);
1258        }
1259}
1260
1261static bool rt_cache_valid(const struct rtable *rt)
1262{
1263        return  rt &&
1264                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1265                !rt_is_expired(rt);
1266}
1267
1268static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1269                           const struct fib_result *res,
1270                           struct fib_nh_exception *fnhe,
1271                           struct fib_info *fi, u16 type, u32 itag)
1272{
1273        bool cached = false;
1274
1275        if (fi) {
1276                struct fib_nh *nh = &FIB_RES_NH(*res);
1277
1278                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1279                        rt->rt_gateway = nh->nh_gw;
1280                        rt->rt_uses_gateway = 1;
1281                }
1282                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1283#ifdef CONFIG_IP_ROUTE_CLASSID
1284                rt->dst.tclassid = nh->nh_tclassid;
1285#endif
1286                if (unlikely(fnhe))
1287                        cached = rt_bind_exception(rt, fnhe, daddr);
1288                else if (!(rt->dst.flags & DST_NOCACHE))
1289                        cached = rt_cache_route(nh, rt);
1290                if (unlikely(!cached)) {
1291                        /* Routes we intend to cache in nexthop exception or
1292                         * FIB nexthop have the DST_NOCACHE bit clear.
1293                         * However, if we are unsuccessful at storing this
1294                         * route into the cache we really need to set it.
1295                         */
1296                        rt->dst.flags |= DST_NOCACHE;
1297                        if (!rt->rt_gateway)
1298                                rt->rt_gateway = daddr;
1299                        rt_add_uncached_list(rt);
1300                }
1301        } else
1302                rt_add_uncached_list(rt);
1303
1304#ifdef CONFIG_IP_ROUTE_CLASSID
1305#ifdef CONFIG_IP_MULTIPLE_TABLES
1306        set_class_tag(rt, res->tclassid);
1307#endif
1308        set_class_tag(rt, itag);
1309#endif
1310}
1311
1312static struct rtable *rt_dst_alloc(struct net_device *dev,
1313                                   bool nopolicy, bool noxfrm, bool will_cache)
1314{
1315        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1316                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1317                         (nopolicy ? DST_NOPOLICY : 0) |
1318                         (noxfrm ? DST_NOXFRM : 0));
1319}
1320
1321/* called in rcu_read_lock() section */
1322static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1323                                u8 tos, struct net_device *dev, int our)
1324{
1325        struct rtable *rth;
1326        struct in_device *in_dev = __in_dev_get_rcu(dev);
1327        u32 itag = 0;
1328        int err;
1329
1330        /* Primary sanity checks. */
1331
1332        if (in_dev == NULL)
1333                return -EINVAL;
1334
1335        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1336            skb->protocol != htons(ETH_P_IP))
1337                goto e_inval;
1338
1339        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1340                if (ipv4_is_loopback(saddr))
1341                        goto e_inval;
1342
1343        if (ipv4_is_zeronet(saddr)) {
1344                if (!ipv4_is_local_multicast(daddr))
1345                        goto e_inval;
1346        } else {
1347                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1348                                          in_dev, &itag);
1349                if (err < 0)
1350                        goto e_err;
1351        }
1352        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1353                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1354        if (!rth)
1355                goto e_nobufs;
1356
1357#ifdef CONFIG_IP_ROUTE_CLASSID
1358        rth->dst.tclassid = itag;
1359#endif
1360        rth->dst.output = ip_rt_bug;
1361
1362        rth->rt_genid   = rt_genid(dev_net(dev));
1363        rth->rt_flags   = RTCF_MULTICAST;
1364        rth->rt_type    = RTN_MULTICAST;
1365        rth->rt_is_input= 1;
1366        rth->rt_iif     = 0;
1367        rth->rt_pmtu    = 0;
1368        rth->rt_gateway = 0;
1369        rth->rt_uses_gateway = 0;
1370        INIT_LIST_HEAD(&rth->rt_uncached);
1371        if (our) {
1372                rth->dst.input= ip_local_deliver;
1373                rth->rt_flags |= RTCF_LOCAL;
1374        }
1375
1376#ifdef CONFIG_IP_MROUTE
1377        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1378                rth->dst.input = ip_mr_input;
1379#endif
1380        RT_CACHE_STAT_INC(in_slow_mc);
1381
1382        skb_dst_set(skb, &rth->dst);
1383        return 0;
1384
1385e_nobufs:
1386        return -ENOBUFS;
1387e_inval:
1388        return -EINVAL;
1389e_err:
1390        return err;
1391}
1392
1393
1394static void ip_handle_martian_source(struct net_device *dev,
1395                                     struct in_device *in_dev,
1396                                     struct sk_buff *skb,
1397                                     __be32 daddr,
1398                                     __be32 saddr)
1399{
1400        RT_CACHE_STAT_INC(in_martian_src);
1401#ifdef CONFIG_IP_ROUTE_VERBOSE
1402        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1403                /*
1404                 *      RFC1812 recommendation, if source is martian,
1405                 *      the only hint is MAC header.
1406                 */
1407                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1408                        &daddr, &saddr, dev->name);
1409                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1410                        print_hex_dump(KERN_WARNING, "ll header: ",
1411                                       DUMP_PREFIX_OFFSET, 16, 1,
1412                                       skb_mac_header(skb),
1413                                       dev->hard_header_len, true);
1414                }
1415        }
1416#endif
1417}
1418
1419/* called in rcu_read_lock() section */
1420static int __mkroute_input(struct sk_buff *skb,
1421                           const struct fib_result *res,
1422                           struct in_device *in_dev,
1423                           __be32 daddr, __be32 saddr, u32 tos)
1424{
1425        struct rtable *rth;
1426        int err;
1427        struct in_device *out_dev;
1428        unsigned int flags = 0;
1429        bool do_cache;
1430        u32 itag;
1431
1432        /* get a working reference to the output device */
1433        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1434        if (out_dev == NULL) {
1435                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1436                return -EINVAL;
1437        }
1438
1439        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1440                                  in_dev->dev, in_dev, &itag);
1441        if (err < 0) {
1442                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1443                                         saddr);
1444
1445                goto cleanup;
1446        }
1447
1448        do_cache = res->fi && !itag;
1449        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1450            (IN_DEV_SHARED_MEDIA(out_dev) ||
1451             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1452                flags |= RTCF_DOREDIRECT;
1453                do_cache = false;
1454        }
1455
1456        if (skb->protocol != htons(ETH_P_IP)) {
1457                /* Not IP (i.e. ARP). Do not create route, if it is
1458                 * invalid for proxy arp. DNAT routes are always valid.
1459                 *
1460                 * Proxy arp feature have been extended to allow, ARP
1461                 * replies back to the same interface, to support
1462                 * Private VLAN switch technologies. See arp.c.
1463                 */
1464                if (out_dev == in_dev &&
1465                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1466                        err = -EINVAL;
1467                        goto cleanup;
1468                }
1469        }
1470
1471        if (do_cache) {
1472                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1473                if (rt_cache_valid(rth)) {
1474                        skb_dst_set_noref(skb, &rth->dst);
1475                        goto out;
1476                }
1477        }
1478
1479        rth = rt_dst_alloc(out_dev->dev,
1480                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1481                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1482        if (!rth) {
1483                err = -ENOBUFS;
1484                goto cleanup;
1485        }
1486
1487        rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1488        rth->rt_flags = flags;
1489        rth->rt_type = res->type;
1490        rth->rt_is_input = 1;
1491        rth->rt_iif     = 0;
1492        rth->rt_pmtu    = 0;
1493        rth->rt_gateway = 0;
1494        rth->rt_uses_gateway = 0;
1495        INIT_LIST_HEAD(&rth->rt_uncached);
1496
1497        rth->dst.input = ip_forward;
1498        rth->dst.output = ip_output;
1499
1500        rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1501        skb_dst_set(skb, &rth->dst);
1502out:
1503        err = 0;
1504 cleanup:
1505        return err;
1506}
1507
1508static int ip_mkroute_input(struct sk_buff *skb,
1509                            struct fib_result *res,
1510                            const struct flowi4 *fl4,
1511                            struct in_device *in_dev,
1512                            __be32 daddr, __be32 saddr, u32 tos)
1513{
1514#ifdef CONFIG_IP_ROUTE_MULTIPATH
1515        if (res->fi && res->fi->fib_nhs > 1)
1516                fib_select_multipath(res);
1517#endif
1518
1519        /* create a routing cache entry */
1520        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1521}
1522
1523/*
1524 *      NOTE. We drop all the packets that has local source
1525 *      addresses, because every properly looped back packet
1526 *      must have correct destination already attached by output routine.
1527 *
1528 *      Such approach solves two big problems:
1529 *      1. Not simplex devices are handled properly.
1530 *      2. IP spoofing attempts are filtered with 100% of guarantee.
1531 *      called with rcu_read_lock()
1532 */
1533
1534static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1535                               u8 tos, struct net_device *dev)
1536{
1537        struct fib_result res;
1538        struct in_device *in_dev = __in_dev_get_rcu(dev);
1539        struct flowi4   fl4;
1540        unsigned int    flags = 0;
1541        u32             itag = 0;
1542        struct rtable   *rth;
1543        int             err = -EINVAL;
1544        struct net    *net = dev_net(dev);
1545        bool do_cache;
1546
1547        /* IP on this device is disabled. */
1548
1549        if (!in_dev)
1550                goto out;
1551
1552        /* Check for the most weird martians, which can be not detected
1553           by fib_lookup.
1554         */
1555
1556        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1557                goto martian_source;
1558
1559        res.fi = NULL;
1560        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1561                goto brd_input;
1562
1563        /* Accept zero addresses only to limited broadcast;
1564         * I even do not know to fix it or not. Waiting for complains :-)
1565         */
1566        if (ipv4_is_zeronet(saddr))
1567                goto martian_source;
1568
1569        if (ipv4_is_zeronet(daddr))
1570                goto martian_destination;
1571
1572        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1573                if (ipv4_is_loopback(daddr))
1574                        goto martian_destination;
1575
1576                if (ipv4_is_loopback(saddr))
1577                        goto martian_source;
1578        }
1579
1580        /*
1581         *      Now we are ready to route packet.
1582         */
1583        fl4.flowi4_oif = 0;
1584        fl4.flowi4_iif = dev->ifindex;
1585        fl4.flowi4_mark = skb->mark;
1586        fl4.flowi4_tos = tos;
1587        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1588        fl4.daddr = daddr;
1589        fl4.saddr = saddr;
1590        err = fib_lookup(net, &fl4, &res);
1591        if (err != 0)
1592                goto no_route;
1593
1594        RT_CACHE_STAT_INC(in_slow_tot);
1595
1596        if (res.type == RTN_BROADCAST)
1597                goto brd_input;
1598
1599        if (res.type == RTN_LOCAL) {
1600                err = fib_validate_source(skb, saddr, daddr, tos,
1601                                          net->loopback_dev->ifindex,
1602                                          dev, in_dev, &itag);
1603                if (err < 0)
1604                        goto martian_source_keep_err;
1605                goto local_input;
1606        }
1607
1608        if (!IN_DEV_FORWARD(in_dev))
1609                goto no_route;
1610        if (res.type != RTN_UNICAST)
1611                goto martian_destination;
1612
1613        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1614out:    return err;
1615
1616brd_input:
1617        if (skb->protocol != htons(ETH_P_IP))
1618                goto e_inval;
1619
1620        if (!ipv4_is_zeronet(saddr)) {
1621                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1622                                          in_dev, &itag);
1623                if (err < 0)
1624                        goto martian_source_keep_err;
1625        }
1626        flags |= RTCF_BROADCAST;
1627        res.type = RTN_BROADCAST;
1628        RT_CACHE_STAT_INC(in_brd);
1629
1630local_input:
1631        do_cache = false;
1632        if (res.fi) {
1633                if (!itag) {
1634                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1635                        if (rt_cache_valid(rth)) {
1636                                skb_dst_set_noref(skb, &rth->dst);
1637                                err = 0;
1638                                goto out;
1639                        }
1640                        do_cache = true;
1641                }
1642        }
1643
1644        rth = rt_dst_alloc(net->loopback_dev,
1645                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1646        if (!rth)
1647                goto e_nobufs;
1648
1649        rth->dst.input= ip_local_deliver;
1650        rth->dst.output= ip_rt_bug;
1651#ifdef CONFIG_IP_ROUTE_CLASSID
1652        rth->dst.tclassid = itag;
1653#endif
1654
1655        rth->rt_genid = rt_genid(net);
1656        rth->rt_flags   = flags|RTCF_LOCAL;
1657        rth->rt_type    = res.type;
1658        rth->rt_is_input = 1;
1659        rth->rt_iif     = 0;
1660        rth->rt_pmtu    = 0;
1661        rth->rt_gateway = 0;
1662        rth->rt_uses_gateway = 0;
1663        INIT_LIST_HEAD(&rth->rt_uncached);
1664        if (res.type == RTN_UNREACHABLE) {
1665                rth->dst.input= ip_error;
1666                rth->dst.error= -err;
1667                rth->rt_flags   &= ~RTCF_LOCAL;
1668        }
1669        if (do_cache)
1670                rt_cache_route(&FIB_RES_NH(res), rth);
1671        skb_dst_set(skb, &rth->dst);
1672        err = 0;
1673        goto out;
1674
1675no_route:
1676        RT_CACHE_STAT_INC(in_no_route);
1677        res.type = RTN_UNREACHABLE;
1678        if (err == -ESRCH)
1679                err = -ENETUNREACH;
1680        goto local_input;
1681
1682        /*
1683         *      Do not cache martian addresses: they should be logged (RFC1812)
1684         */
1685martian_destination:
1686        RT_CACHE_STAT_INC(in_martian_dst);
1687#ifdef CONFIG_IP_ROUTE_VERBOSE
1688        if (IN_DEV_LOG_MARTIANS(in_dev))
1689                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1690                                     &daddr, &saddr, dev->name);
1691#endif
1692
1693e_inval:
1694        err = -EINVAL;
1695        goto out;
1696
1697e_nobufs:
1698        err = -ENOBUFS;
1699        goto out;
1700
1701martian_source:
1702        err = -EINVAL;
1703martian_source_keep_err:
1704        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1705        goto out;
1706}
1707
1708int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1709                         u8 tos, struct net_device *dev)
1710{
1711        int res;
1712
1713        rcu_read_lock();
1714
1715        /* Multicast recognition logic is moved from route cache to here.
1716           The problem was that too many Ethernet cards have broken/missing
1717           hardware multicast filters :-( As result the host on multicasting
1718           network acquires a lot of useless route cache entries, sort of
1719           SDR messages from all the world. Now we try to get rid of them.
1720           Really, provided software IP multicast filter is organized
1721           reasonably (at least, hashed), it does not result in a slowdown
1722           comparing with route cache reject entries.
1723           Note, that multicast routers are not affected, because
1724           route cache entry is created eventually.
1725         */
1726        if (ipv4_is_multicast(daddr)) {
1727                struct in_device *in_dev = __in_dev_get_rcu(dev);
1728
1729                if (in_dev) {
1730                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1731                                                  ip_hdr(skb)->protocol);
1732                        if (our
1733#ifdef CONFIG_IP_MROUTE
1734                                ||
1735                            (!ipv4_is_local_multicast(daddr) &&
1736                             IN_DEV_MFORWARD(in_dev))
1737#endif
1738                           ) {
1739                                int res = ip_route_input_mc(skb, daddr, saddr,
1740                                                            tos, dev, our);
1741                                rcu_read_unlock();
1742                                return res;
1743                        }
1744                }
1745                rcu_read_unlock();
1746                return -EINVAL;
1747        }
1748        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1749        rcu_read_unlock();
1750        return res;
1751}
1752EXPORT_SYMBOL(ip_route_input_noref);
1753
1754/* called with rcu_read_lock() */
1755static struct rtable *__mkroute_output(const struct fib_result *res,
1756                                       const struct flowi4 *fl4, int orig_oif,
1757                                       struct net_device *dev_out,
1758                                       unsigned int flags)
1759{
1760        struct fib_info *fi = res->fi;
1761        struct fib_nh_exception *fnhe;
1762        struct in_device *in_dev;
1763        u16 type = res->type;
1764        struct rtable *rth;
1765        bool do_cache;
1766
1767        in_dev = __in_dev_get_rcu(dev_out);
1768        if (!in_dev)
1769                return ERR_PTR(-EINVAL);
1770
1771        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1772                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1773                        return ERR_PTR(-EINVAL);
1774
1775        if (ipv4_is_lbcast(fl4->daddr))
1776                type = RTN_BROADCAST;
1777        else if (ipv4_is_multicast(fl4->daddr))
1778                type = RTN_MULTICAST;
1779        else if (ipv4_is_zeronet(fl4->daddr))
1780                return ERR_PTR(-EINVAL);
1781
1782        if (dev_out->flags & IFF_LOOPBACK)
1783                flags |= RTCF_LOCAL;
1784
1785        if (type == RTN_BROADCAST) {
1786                flags |= RTCF_BROADCAST | RTCF_LOCAL;
1787                fi = NULL;
1788        } else if (type == RTN_MULTICAST) {
1789                flags |= RTCF_MULTICAST | RTCF_LOCAL;
1790                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1791                                     fl4->flowi4_proto))
1792                        flags &= ~RTCF_LOCAL;
1793                /* If multicast route do not exist use
1794                 * default one, but do not gateway in this case.
1795                 * Yes, it is hack.
1796                 */
1797                if (fi && res->prefixlen < 4)
1798                        fi = NULL;
1799        }
1800
1801        fnhe = NULL;
1802        do_cache = fi != NULL;
1803        if (fi) {
1804                struct rtable __rcu **prth;
1805                struct fib_nh *nh = &FIB_RES_NH(*res);
1806
1807                fnhe = find_exception(nh, fl4->daddr);
1808                if (fnhe)
1809                        prth = &fnhe->fnhe_rth;
1810                else {
1811                        if (unlikely(fl4->flowi4_flags &
1812                                     FLOWI_FLAG_KNOWN_NH &&
1813                                     !(nh->nh_gw &&
1814                                       nh->nh_scope == RT_SCOPE_LINK))) {
1815                                do_cache = false;
1816                                goto add;
1817                        }
1818                        prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1819                }
1820                rth = rcu_dereference(*prth);
1821                if (rt_cache_valid(rth)) {
1822                        dst_hold(&rth->dst);
1823                        return rth;
1824                }
1825        }
1826
1827add:
1828        rth = rt_dst_alloc(dev_out,
1829                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1830                           IN_DEV_CONF_GET(in_dev, NOXFRM),
1831                           do_cache);
1832        if (!rth)
1833                return ERR_PTR(-ENOBUFS);
1834
1835        rth->dst.output = ip_output;
1836
1837        rth->rt_genid = rt_genid(dev_net(dev_out));
1838        rth->rt_flags   = flags;
1839        rth->rt_type    = type;
1840        rth->rt_is_input = 0;
1841        rth->rt_iif     = orig_oif ? : 0;
1842        rth->rt_pmtu    = 0;
1843        rth->rt_gateway = 0;
1844        rth->rt_uses_gateway = 0;
1845        INIT_LIST_HEAD(&rth->rt_uncached);
1846
1847        RT_CACHE_STAT_INC(out_slow_tot);
1848
1849        if (flags & RTCF_LOCAL)
1850                rth->dst.input = ip_local_deliver;
1851        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1852                if (flags & RTCF_LOCAL &&
1853                    !(dev_out->flags & IFF_LOOPBACK)) {
1854                        rth->dst.output = ip_mc_output;
1855                        RT_CACHE_STAT_INC(out_slow_mc);
1856                }
1857#ifdef CONFIG_IP_MROUTE
1858                if (type == RTN_MULTICAST) {
1859                        if (IN_DEV_MFORWARD(in_dev) &&
1860                            !ipv4_is_local_multicast(fl4->daddr)) {
1861                                rth->dst.input = ip_mr_input;
1862                                rth->dst.output = ip_mc_output;
1863                        }
1864                }
1865#endif
1866        }
1867
1868        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1869
1870        return rth;
1871}
1872
1873/*
1874 * Major route resolver routine.
1875 */
1876
1877struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1878{
1879        struct net_device *dev_out = NULL;
1880        __u8 tos = RT_FL_TOS(fl4);
1881        unsigned int flags = 0;
1882        struct fib_result res;
1883        struct rtable *rth;
1884        int orig_oif;
1885
1886        res.tclassid    = 0;
1887        res.fi          = NULL;
1888        res.table       = NULL;
1889
1890        orig_oif = fl4->flowi4_oif;
1891
1892        fl4->flowi4_iif = net->loopback_dev->ifindex;
1893        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1894        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1895                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1896
1897        rcu_read_lock();
1898        if (fl4->saddr) {
1899                rth = ERR_PTR(-EINVAL);
1900                if (ipv4_is_multicast(fl4->saddr) ||
1901                    ipv4_is_lbcast(fl4->saddr) ||
1902                    ipv4_is_zeronet(fl4->saddr))
1903                        goto out;
1904
1905                /* I removed check for oif == dev_out->oif here.
1906                   It was wrong for two reasons:
1907                   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1908                      is assigned to multiple interfaces.
1909                   2. Moreover, we are allowed to send packets with saddr
1910                      of another iface. --ANK
1911                 */
1912
1913                if (fl4->flowi4_oif == 0 &&
1914                    (ipv4_is_multicast(fl4->daddr) ||
1915                     ipv4_is_lbcast(fl4->daddr))) {
1916                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1917                        dev_out = __ip_dev_find(net, fl4->saddr, false);
1918                        if (dev_out == NULL)
1919                                goto out;
1920
1921                        /* Special hack: user can direct multicasts
1922                           and limited broadcast via necessary interface
1923                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1924                           This hack is not just for fun, it allows
1925                           vic,vat and friends to work.
1926                           They bind socket to loopback, set ttl to zero
1927                           and expect that it will work.
1928                           From the viewpoint of routing cache they are broken,
1929                           because we are not allowed to build multicast path
1930                           with loopback source addr (look, routing cache
1931                           cannot know, that ttl is zero, so that packet
1932                           will not leave this host and route is valid).
1933                           Luckily, this hack is good workaround.
1934                         */
1935
1936                        fl4->flowi4_oif = dev_out->ifindex;
1937                        goto make_route;
1938                }
1939
1940                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1941                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1942                        if (!__ip_dev_find(net, fl4->saddr, false))
1943                                goto out;
1944                }
1945        }
1946
1947
1948        if (fl4->flowi4_oif) {
1949                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1950                rth = ERR_PTR(-ENODEV);
1951                if (dev_out == NULL)
1952                        goto out;
1953
1954                /* RACE: Check return value of inet_select_addr instead. */
1955                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1956                        rth = ERR_PTR(-ENETUNREACH);
1957                        goto out;
1958                }
1959                if (ipv4_is_local_multicast(fl4->daddr) ||
1960                    ipv4_is_lbcast(fl4->daddr)) {
1961                        if (!fl4->saddr)
1962                                fl4->saddr = inet_select_addr(dev_out, 0,
1963                                                              RT_SCOPE_LINK);
1964                        goto make_route;
1965                }
1966                if (fl4->saddr) {
1967                        if (ipv4_is_multicast(fl4->daddr))
1968                                fl4->saddr = inet_select_addr(dev_out, 0,
1969                                                              fl4->flowi4_scope);
1970                        else if (!fl4->daddr)
1971                                fl4->saddr = inet_select_addr(dev_out, 0,
1972                                                              RT_SCOPE_HOST);
1973                }
1974        }
1975
1976        if (!fl4->daddr) {
1977                fl4->daddr = fl4->saddr;
1978                if (!fl4->daddr)
1979                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1980                dev_out = net->loopback_dev;
1981                fl4->flowi4_oif = net->loopback_dev->ifindex;
1982                res.type = RTN_LOCAL;
1983                flags |= RTCF_LOCAL;
1984                goto make_route;
1985        }
1986
1987        if (fib_lookup(net, fl4, &res)) {
1988                res.fi = NULL;
1989                res.table = NULL;
1990                if (fl4->flowi4_oif) {
1991                        /* Apparently, routing tables are wrong. Assume,
1992                           that the destination is on link.
1993
1994                           WHY? DW.
1995                           Because we are allowed to send to iface
1996                           even if it has NO routes and NO assigned
1997                           addresses. When oif is specified, routing
1998                           tables are looked up with only one purpose:
1999                           to catch if destination is gatewayed, rather than
2000                           direct. Moreover, if MSG_DONTROUTE is set,
2001                           we send packet, ignoring both routing tables
2002                           and ifaddr state. --ANK
2003
2004
2005                           We could make it even if oif is unknown,
2006                           likely IPv6, but we do not.
2007                         */
2008
2009                        if (fl4->saddr == 0)
2010                                fl4->saddr = inet_select_addr(dev_out, 0,
2011                                                              RT_SCOPE_LINK);
2012                        res.type = RTN_UNICAST;
2013                        goto make_route;
2014                }
2015                rth = ERR_PTR(-ENETUNREACH);
2016                goto out;
2017        }
2018
2019        if (res.type == RTN_LOCAL) {
2020                if (!fl4->saddr) {
2021                        if (res.fi->fib_prefsrc)
2022                                fl4->saddr = res.fi->fib_prefsrc;
2023                        else
2024                                fl4->saddr = fl4->daddr;
2025                }
2026                dev_out = net->loopback_dev;
2027                fl4->flowi4_oif = dev_out->ifindex;
2028                flags |= RTCF_LOCAL;
2029                goto make_route;
2030        }
2031
2032#ifdef CONFIG_IP_ROUTE_MULTIPATH
2033        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2034                fib_select_multipath(&res);
2035        else
2036#endif
2037        if (!res.prefixlen &&
2038            res.table->tb_num_default > 1 &&
2039            res.type == RTN_UNICAST && !fl4->flowi4_oif)
2040                fib_select_default(&res);
2041
2042        if (!fl4->saddr)
2043                fl4->saddr = FIB_RES_PREFSRC(net, res);
2044
2045        dev_out = FIB_RES_DEV(res);
2046        fl4->flowi4_oif = dev_out->ifindex;
2047
2048
2049make_route:
2050        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2051
2052out:
2053        rcu_read_unlock();
2054        return rth;
2055}
2056EXPORT_SYMBOL_GPL(__ip_route_output_key);
2057
2058static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2059{
2060        return NULL;
2061}
2062
2063static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2064{
2065        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2066
2067        return mtu ? : dst->dev->mtu;
2068}
2069
2070static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2071                                          struct sk_buff *skb, u32 mtu)
2072{
2073}
2074
/* dst_ops.redirect hook for blackhole routes: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2079
2080static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2081                                          unsigned long old)
2082{
2083        return NULL;
2084}
2085
2086static struct dst_ops ipv4_dst_blackhole_ops = {
2087        .family                 =       AF_INET,
2088        .protocol               =       cpu_to_be16(ETH_P_IP),
2089        .check                  =       ipv4_blackhole_dst_check,
2090        .mtu                    =       ipv4_blackhole_mtu,
2091        .default_advmss         =       ipv4_default_advmss,
2092        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2093        .redirect               =       ipv4_rt_blackhole_redirect,
2094        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2095        .neigh_lookup           =       ipv4_neigh_lookup,
2096};
2097
2098struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2099{
2100        struct rtable *ort = (struct rtable *) dst_orig;
2101        struct rtable *rt;
2102
2103        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2104        if (rt) {
2105                struct dst_entry *new = &rt->dst;
2106
2107                new->__use = 1;
2108                new->input = dst_discard;
2109                new->output = dst_discard;
2110
2111                new->dev = ort->dst.dev;
2112                if (new->dev)
2113                        dev_hold(new->dev);
2114
2115                rt->rt_is_input = ort->rt_is_input;
2116                rt->rt_iif = ort->rt_iif;
2117                rt->rt_pmtu = ort->rt_pmtu;
2118
2119                rt->rt_genid = rt_genid(net);
2120                rt->rt_flags = ort->rt_flags;
2121                rt->rt_type = ort->rt_type;
2122                rt->rt_gateway = ort->rt_gateway;
2123                rt->rt_uses_gateway = ort->rt_uses_gateway;
2124
2125                INIT_LIST_HEAD(&rt->rt_uncached);
2126
2127                dst_free(new);
2128        }
2129
2130        dst_release(dst_orig);
2131
2132        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2133}
2134
2135struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2136                                    struct sock *sk)
2137{
2138        struct rtable *rt = __ip_route_output_key(net, flp4);
2139
2140        if (IS_ERR(rt))
2141                return rt;
2142
2143        if (flp4->flowi4_proto)
2144                rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2145                                                   flowi4_to_flowi(flp4),
2146                                                   sk, 0);
2147
2148        return rt;
2149}
2150EXPORT_SYMBOL_GPL(ip_route_output_flow);
2151
2152static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2153                        struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2154                        u32 seq, int event, int nowait, unsigned int flags)
2155{
2156        struct rtable *rt = skb_rtable(skb);
2157        struct rtmsg *r;
2158        struct nlmsghdr *nlh;
2159        unsigned long expires = 0;
2160        u32 error;
2161        u32 metrics[RTAX_MAX];
2162
2163        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2164        if (nlh == NULL)
2165                return -EMSGSIZE;
2166
2167        r = nlmsg_data(nlh);
2168        r->rtm_family    = AF_INET;
2169        r->rtm_dst_len  = 32;
2170        r->rtm_src_len  = 0;
2171        r->rtm_tos      = fl4->flowi4_tos;
2172        r->rtm_table    = RT_TABLE_MAIN;
2173        if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2174                goto nla_put_failure;
2175        r->rtm_type     = rt->rt_type;
2176        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2177        r->rtm_protocol = RTPROT_UNSPEC;
2178        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2179        if (rt->rt_flags & RTCF_NOTIFY)
2180                r->rtm_flags |= RTM_F_NOTIFY;
2181
2182        if (nla_put_be32(skb, RTA_DST, dst))
2183                goto nla_put_failure;
2184        if (src) {
2185                r->rtm_src_len = 32;
2186                if (nla_put_be32(skb, RTA_SRC, src))
2187                        goto nla_put_failure;
2188        }
2189        if (rt->dst.dev &&
2190            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2191                goto nla_put_failure;
2192#ifdef CONFIG_IP_ROUTE_CLASSID
2193        if (rt->dst.tclassid &&
2194            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2195                goto nla_put_failure;
2196#endif
2197        if (!rt_is_input_route(rt) &&
2198            fl4->saddr != src) {
2199                if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2200                        goto nla_put_failure;
2201        }
2202        if (rt->rt_uses_gateway &&
2203            nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2204                goto nla_put_failure;
2205
2206        expires = rt->dst.expires;
2207        if (expires) {
2208                unsigned long now = jiffies;
2209
2210                if (time_before(now, expires))
2211                        expires -= now;
2212                else
2213                        expires = 0;
2214        }
2215
2216        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2217        if (rt->rt_pmtu && expires)
2218                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2219        if (rtnetlink_put_metrics(skb, metrics) < 0)
2220                goto nla_put_failure;
2221
2222        if (fl4->flowi4_mark &&
2223            nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2224                goto nla_put_failure;
2225
2226        error = rt->dst.error;
2227
2228        if (rt_is_input_route(rt)) {
2229                if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2230                        goto nla_put_failure;
2231        }
2232
2233        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2234                goto nla_put_failure;
2235
2236        return nlmsg_end(skb, nlh);
2237
2238nla_put_failure:
2239        nlmsg_cancel(skb, nlh);
2240        return -EMSGSIZE;
2241}
2242
2243static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2244{
2245        struct net *net = sock_net(in_skb->sk);
2246        struct rtmsg *rtm;
2247        struct nlattr *tb[RTA_MAX+1];
2248        struct rtable *rt = NULL;
2249        struct flowi4 fl4;
2250        __be32 dst = 0;
2251        __be32 src = 0;
2252        u32 iif;
2253        int err;
2254        int mark;
2255        struct sk_buff *skb;
2256
2257        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2258        if (err < 0)
2259                goto errout;
2260
2261        rtm = nlmsg_data(nlh);
2262
2263        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2264        if (skb == NULL) {
2265                err = -ENOBUFS;
2266                goto errout;
2267        }
2268
2269        /* Reserve room for dummy headers, this skb can pass
2270           through good chunk of routing engine.
2271         */
2272        skb_reset_mac_header(skb);
2273        skb_reset_network_header(skb);
2274
2275        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2276        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2277        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2278
2279        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2280        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2281        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2282        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2283
2284        memset(&fl4, 0, sizeof(fl4));
2285        fl4.daddr = dst;
2286        fl4.saddr = src;
2287        fl4.flowi4_tos = rtm->rtm_tos;
2288        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2289        fl4.flowi4_mark = mark;
2290
2291        if (iif) {
2292                struct net_device *dev;
2293
2294                dev = __dev_get_by_index(net, iif);
2295                if (dev == NULL) {
2296                        err = -ENODEV;
2297                        goto errout_free;
2298                }
2299
2300                skb->protocol   = htons(ETH_P_IP);
2301                skb->dev        = dev;
2302                skb->mark       = mark;
2303                local_bh_disable();
2304                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2305                local_bh_enable();
2306
2307                rt = skb_rtable(skb);
2308                if (err == 0 && rt->dst.error)
2309                        err = -rt->dst.error;
2310        } else {
2311                rt = ip_route_output_key(net, &fl4);
2312
2313                err = 0;
2314                if (IS_ERR(rt))
2315                        err = PTR_ERR(rt);
2316        }
2317
2318        if (err)
2319                goto errout_free;
2320
2321        skb_dst_set(skb, &rt->dst);
2322        if (rtm->rtm_flags & RTM_F_NOTIFY)
2323                rt->rt_flags |= RTCF_NOTIFY;
2324
2325        err = rt_fill_info(net, dst, src, &fl4, skb,
2326                           NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2327                           RTM_NEWROUTE, 0, 0);
2328        if (err <= 0)
2329                goto errout_free;
2330
2331        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2332errout:
2333        return err;
2334
2335errout_free:
2336        kfree_skb(skb);
2337        goto errout;
2338}
2339
2340int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2341{
2342        return skb->len;
2343}
2344
2345void ip_rt_multicast_event(struct in_device *in_dev)
2346{
2347        rt_cache_flush(dev_net(in_dev->dev));
2348}
2349
2350#ifdef CONFIG_SYSCTL
2351static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2352                                        void __user *buffer,
2353                                        size_t *lenp, loff_t *ppos)
2354{
2355        if (write) {
2356                rt_cache_flush((struct net *)__ctl->extra1);
2357                return 0;
2358        }
2359
2360        return -EINVAL;
2361}
2362
2363static ctl_table ipv4_route_table[] = {
2364        {
2365                .procname       = "gc_thresh",
2366                .data           = &ipv4_dst_ops.gc_thresh,
2367                .maxlen         = sizeof(int),
2368                .mode           = 0644,
2369                .proc_handler   = proc_dointvec,
2370        },
2371        {
2372                .procname       = "max_size",
2373                .data           = &ip_rt_max_size,
2374                .maxlen         = sizeof(int),
2375                .mode           = 0644,
2376                .proc_handler   = proc_dointvec,
2377        },
2378        {
2379                /*  Deprecated. Use gc_min_interval_ms */
2380
2381                .procname       = "gc_min_interval",
2382                .data           = &ip_rt_gc_min_interval,
2383                .maxlen         = sizeof(int),
2384                .mode           = 0644,
2385                .proc_handler   = proc_dointvec_jiffies,
2386        },
2387        {
2388                .procname       = "gc_min_interval_ms",
2389                .data           = &ip_rt_gc_min_interval,
2390                .maxlen         = sizeof(int),
2391                .mode           = 0644,
2392                .proc_handler   = proc_dointvec_ms_jiffies,
2393        },
2394        {
2395                .procname       = "gc_timeout",
2396                .data           = &ip_rt_gc_timeout,
2397                .maxlen         = sizeof(int),
2398                .mode           = 0644,
2399                .proc_handler   = proc_dointvec_jiffies,
2400        },
2401        {
2402                .procname       = "gc_interval",
2403                .data           = &ip_rt_gc_interval,
2404                .maxlen         = sizeof(int),
2405                .mode           = 0644,
2406                .proc_handler   = proc_dointvec_jiffies,
2407        },
2408        {
2409                .procname       = "redirect_load",
2410                .data           = &ip_rt_redirect_load,
2411                .maxlen         = sizeof(int),
2412                .mode           = 0644,
2413                .proc_handler   = proc_dointvec,
2414        },
2415        {
2416                .procname       = "redirect_number",
2417                .data           = &ip_rt_redirect_number,
2418                .maxlen         = sizeof(int),
2419                .mode           = 0644,
2420                .proc_handler   = proc_dointvec,
2421        },
2422        {
2423                .procname       = "redirect_silence",
2424                .data           = &ip_rt_redirect_silence,
2425                .maxlen         = sizeof(int),
2426                .mode           = 0644,
2427                .proc_handler   = proc_dointvec,
2428        },
2429        {
2430                .procname       = "error_cost",
2431                .data           = &ip_rt_error_cost,
2432                .maxlen         = sizeof(int),
2433                .mode           = 0644,
2434                .proc_handler   = proc_dointvec,
2435        },
2436        {
2437                .procname       = "error_burst",
2438                .data           = &ip_rt_error_burst,
2439                .maxlen         = sizeof(int),
2440                .mode           = 0644,
2441                .proc_handler   = proc_dointvec,
2442        },
2443        {
2444                .procname       = "gc_elasticity",
2445                .data           = &ip_rt_gc_elasticity,
2446                .maxlen         = sizeof(int),
2447                .mode           = 0644,
2448                .proc_handler   = proc_dointvec,
2449        },
2450        {
2451                .procname       = "mtu_expires",
2452                .data           = &ip_rt_mtu_expires,
2453                .maxlen         = sizeof(int),
2454                .mode           = 0644,
2455                .proc_handler   = proc_dointvec_jiffies,
2456        },
2457        {
2458                .procname       = "min_pmtu",
2459                .data           = &ip_rt_min_pmtu,
2460                .maxlen         = sizeof(int),
2461                .mode           = 0644,
2462                .proc_handler   = proc_dointvec,
2463        },
2464        {
2465                .procname       = "min_adv_mss",
2466                .data           = &ip_rt_min_advmss,
2467                .maxlen         = sizeof(int),
2468                .mode           = 0644,
2469                .proc_handler   = proc_dointvec,
2470        },
2471        { }
2472};
2473
2474static struct ctl_table ipv4_route_flush_table[] = {
2475        {
2476                .procname       = "flush",
2477                .maxlen         = sizeof(int),
2478                .mode           = 0200,
2479                .proc_handler   = ipv4_sysctl_rtcache_flush,
2480        },
2481        { },
2482};
2483
2484static __net_init int sysctl_route_net_init(struct net *net)
2485{
2486        struct ctl_table *tbl;
2487
2488        tbl = ipv4_route_flush_table;
2489        if (!net_eq(net, &init_net)) {
2490                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2491                if (tbl == NULL)
2492                        goto err_dup;
2493        }
2494        tbl[0].extra1 = net;
2495
2496        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2497        if (net->ipv4.route_hdr == NULL)
2498                goto err_reg;
2499        return 0;
2500
2501err_reg:
2502        if (tbl != ipv4_route_flush_table)
2503                kfree(tbl);
2504err_dup:
2505        return -ENOMEM;
2506}
2507
2508static __net_exit void sysctl_route_net_exit(struct net *net)
2509{
2510        struct ctl_table *tbl;
2511
2512        tbl = net->ipv4.route_hdr->ctl_table_arg;
2513        unregister_net_sysctl_table(net->ipv4.route_hdr);
2514        BUG_ON(tbl == ipv4_route_flush_table);
2515        kfree(tbl);
2516}
2517
2518static __net_initdata struct pernet_operations sysctl_route_ops = {
2519        .init = sysctl_route_net_init,
2520        .exit = sysctl_route_net_exit,
2521};
2522#endif
2523
2524static __net_init int rt_genid_init(struct net *net)
2525{
2526        atomic_set(&net->rt_genid, 0);
2527        get_random_bytes(&net->ipv4.dev_addr_genid,
2528                         sizeof(net->ipv4.dev_addr_genid));
2529        return 0;
2530}
2531
2532static __net_initdata struct pernet_operations rt_genid_ops = {
2533        .init = rt_genid_init,
2534};
2535
2536static int __net_init ipv4_inetpeer_init(struct net *net)
2537{
2538        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2539
2540        if (!bp)
2541                return -ENOMEM;
2542        inet_peer_base_init(bp);
2543        net->ipv4.peers = bp;
2544        return 0;
2545}
2546
2547static void __net_exit ipv4_inetpeer_exit(struct net *net)
2548{
2549        struct inet_peer_base *bp = net->ipv4.peers;
2550
2551        net->ipv4.peers = NULL;
2552        inetpeer_invalidate_tree(bp);
2553        kfree(bp);
2554}
2555
2556static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2557        .init   =       ipv4_inetpeer_init,
2558        .exit   =       ipv4_inetpeer_exit,
2559};
2560
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2564
2565int __init ip_rt_init(void)
2566{
2567        int rc = 0;
2568
2569#ifdef CONFIG_IP_ROUTE_CLASSID
2570        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2571        if (!ip_rt_acct)
2572                panic("IP: failed to allocate ip_rt_acct\n");
2573#endif
2574
2575        ipv4_dst_ops.kmem_cachep =
2576                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2577                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2578
2579        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2580
2581        if (dst_entries_init(&ipv4_dst_ops) < 0)
2582                panic("IP: failed to allocate ipv4_dst_ops counter\n");
2583
2584        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2585                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2586
2587        ipv4_dst_ops.gc_thresh = ~0;
2588        ip_rt_max_size = INT_MAX;
2589
2590        devinet_init();
2591        ip_fib_init();
2592
2593        if (ip_rt_proc_init())
2594                pr_err("Unable to create route proc files\n");
2595#ifdef CONFIG_XFRM
2596        xfrm_init();
2597        xfrm4_init(ip_rt_max_size);
2598#endif
2599        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2600
2601#ifdef CONFIG_SYSCTL
2602        register_pernet_subsys(&sysctl_route_ops);
2603#endif
2604        register_pernet_subsys(&rt_genid_ops);
2605        register_pernet_subsys(&ipv4_inetpeer_ops);
2606        return rc;
2607}
2608
2609#ifdef CONFIG_SYSCTL
2610/*
2611 * We really need to sanitize the damn ipv4 init order, then all
2612 * this nonsense will go away.
2613 */
2614void __init ip_static_sysctl_init(void)
2615{
2616        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2617}
2618#endif
2619
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.