/* linux/net/ipv4/route.c */
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       Verify area fixes.
  16 *              Alan Cox        :       cli() protects routing changes
  17 *              Rui Oliveira    :       ICMP routing table updates
  18 *              (rco@di.uminho.pt)      Routing table insertion and update
  19 *              Linus Torvalds  :       Rewrote bits to be sensible
  20 *              Alan Cox        :       Added BSD route gw semantics
  21 *              Alan Cox        :       Super /proc >4K
  22 *              Alan Cox        :       MTU in route table
  23 *              Alan Cox        :       MSS actually. Also added the window
  24 *                                      clamper.
  25 *              Sam Lantinga    :       Fixed route matching in rt_del()
  26 *              Alan Cox        :       Routing cache support.
  27 *              Alan Cox        :       Removed compatibility cruft.
  28 *              Alan Cox        :       RTF_REJECT support.
  29 *              Alan Cox        :       TCP irtt support.
  30 *              Jonathan Naylor :       Added Metric support.
  31 *      Miquel van Smoorenburg  :       BSD API fixes.
  32 *      Miquel van Smoorenburg  :       Metrics.
  33 *              Alan Cox        :       Use __u32 properly
  34 *              Alan Cox        :       Aligned routing errors more closely with BSD
  35 *                                      our system is still very different.
  36 *              Alan Cox        :       Faster /proc handling
  37 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38 *                                      routing caches and better behaviour.
  39 *
  40 *              Olaf Erb        :       irtt wasn't being copied right.
  41 *              Bjorn Ekwall    :       Kerneld route support.
  42 *              Alan Cox        :       Multicast fixed (I hope)
  43 *              Pavel Krauz     :       Limited broadcast fixed
  44 *              Mike McLagan    :       Routing by source
  45 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46 *                                      route.c and rewritten from scratch.
  47 *              Andi Kleen      :       Load-limit warning messages.
  48 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52 *              Marc Boucher    :       routing by fwmark
  53 *      Robert Olsson           :       Added rt_cache statistics
  54 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58 *
  59 *              This program is free software; you can redistribute it and/or
  60 *              modify it under the terms of the GNU General Public License
  61 *              as published by the Free Software Foundation; either version
  62 *              2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/skbuff.h>
  83#include <linux/inetdevice.h>
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
  92#include <net/dst.h>
  93#include <net/net_namespace.h>
  94#include <net/protocol.h>
  95#include <net/ip.h>
  96#include <net/route.h>
  97#include <net/inetpeer.h>
  98#include <net/sock.h>
  99#include <net/ip_fib.h>
 100#include <net/arp.h>
 101#include <net/tcp.h>
 102#include <net/icmp.h>
 103#include <net/xfrm.h>
 104#include <net/netevent.h>
 105#include <net/rtnetlink.h>
 106#ifdef CONFIG_SYSCTL
 107#include <linux/sysctl.h>
 108#include <linux/kmemleak.h>
 109#endif
 110#include <net/secure_seq.h>
 111
/* Mask a flowi4 TOS down to the bits meaningful for route lookup,
 * preserving the RTO_ONLINK flag that may be encoded alongside them.
 */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Upper clamp for route MTUs, just under 64K. */
#define IP_MAX_MTU	0xFFF0

/* Default garbage-collection timeout (in jiffies). */
#define RT_GC_TIMEOUT (300*HZ)

/* Tunables controlling GC, ICMP redirect/error rate limiting and PMTU
 * handling.  Time-valued entries are in jiffies.
 */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;	/* max redirects before giving up */
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;	/* token cost per ICMP error sent */
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;	/* payload + IP + TCP headers */
static int ip_rt_min_advmss __read_mostly	= 256;
 132
/*
 *	Interface to generic destination cache.
 */

/* Forward declarations for the dst_ops methods installed in
 * ipv4_dst_ops below; the definitions follow later in this file.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		ipv4_dst_destroy(struct dst_entry *dst);
 147
/* dst_ops->ifdown hook: intentionally a no-op — IPv4 routes keep no
 * per-device state that needs tearing down here; the empty stub just
 * fills the mandatory dst_ops slot.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
 152
/* dst_ops->cow_metrics hook: copy-on-write of metrics is never
 * expected to be triggered for IPv4 dsts, so loudly warn and fail
 * if it ever is.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
 158
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

/* Destination-cache operations for IPv4 routes; wires the handlers
 * above/below into the generic dst layer.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
 179
/* The ECN variant of a TOS value maps to the same traffic-control
 * priority as its base class.
 */
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Map the 4 IP TOS bits (table indexed by TOS >> 1) to a TC_PRIO_*
 * queueing priority.  Entries alternate base class / ECN variant.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

/* Per-cpu routing statistics, exported via /proc/net/stat/rt_cache. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 204
 205#ifdef CONFIG_PROC_FS
 206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 207{
 208        if (*pos)
 209                return NULL;
 210        return SEQ_START_TOKEN;
 211}
 212
 213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 214{
 215        ++*pos;
 216        return NULL;
 217}
 218
/* seq_file stop: nothing to release for this iterator. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
 222
 223static int rt_cache_seq_show(struct seq_file *seq, void *v)
 224{
 225        if (v == SEQ_START_TOKEN)
 226                seq_printf(seq, "%-127s\n",
 227                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 228                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 229                           "HHUptod\tSpecDst");
 230        return 0;
 231}
 232
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

/* Open /proc/net/rt_cache using the header-only iterator above. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt_cache_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
 252
 253
 254static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 255{
 256        int cpu;
 257
 258        if (*pos == 0)
 259                return SEQ_START_TOKEN;
 260
 261        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 262                if (!cpu_possible(cpu))
 263                        continue;
 264                *pos = cpu+1;
 265                return &per_cpu(rt_cache_stat, cpu);
 266        }
 267        return NULL;
 268}
 269
 270static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 271{
 272        int cpu;
 273
 274        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 275                if (!cpu_possible(cpu))
 276                        continue;
 277                *pos = cpu+1;
 278                return &per_cpu(rt_cache_stat, cpu);
 279        }
 280        return NULL;
 281
 282}
 283
/* seq_file stop: no resources held by the per-cpu iterator. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
 288
/* Print one /proc/net/stat/rt_cache row: either the header (for the
 * start token) or the counters of the CPU handed in as @v.  The first
 * column ("entries") is the global dst count, repeated on every row.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	/* Field order must match the header line above exactly. */
	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
 322
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


/* Open /proc/net/stat/rt_cache with the per-cpu stats iterator. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt_cpu_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
 343
 344#ifdef CONFIG_IP_ROUTE_CLASSID
 345static int rt_acct_proc_show(struct seq_file *m, void *v)
 346{
 347        struct ip_rt_acct *dst, *src;
 348        unsigned int i, j;
 349
 350        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 351        if (!dst)
 352                return -ENOMEM;
 353
 354        for_each_possible_cpu(i) {
 355                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 356                for (j = 0; j < 256; j++) {
 357                        dst[j].o_bytes   += src[j].o_bytes;
 358                        dst[j].o_packets += src[j].o_packets;
 359                        dst[j].i_bytes   += src[j].i_bytes;
 360                        dst[j].i_packets += src[j].i_packets;
 361                }
 362        }
 363
 364        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 365        kfree(dst);
 366        return 0;
 367}
 368
/* Open /proc/net/rt_acct as a single-shot dump. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
 381#endif
 382
/* Create this namespace's routing /proc entries:
 *   /proc/net/rt_cache, /proc/net/stat/rt_cache and (with
 *   CONFIG_IP_ROUTE_CLASSID) /proc/net/rt_acct.
 * On failure, unwind whatever was already registered (the errN labels
 * mirror the creation order) and return -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
 413
/* Remove the /proc entries created by ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
 422
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

/* Register the per-namespace routing /proc entries. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* CONFIG_PROC_FS disabled: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
 439
/* A cached route is stale when its generation id no longer matches the
 * owning namespace's current route generation (see rt_cache_flush()).
 */
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
 444
/* Invalidate every cached route in @net by bumping the generation id;
 * stale entries are then detected lazily by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}
 449
 450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 451                                           struct sk_buff *skb,
 452                                           const void *daddr)
 453{
 454        struct net_device *dev = dst->dev;
 455        const __be32 *pkey = daddr;
 456        const struct rtable *rt;
 457        struct neighbour *n;
 458
 459        rt = (const struct rtable *) dst;
 460        if (rt->rt_gateway)
 461                pkey = (const __be32 *) &rt->rt_gateway;
 462        else if (skb)
 463                pkey = &ip_hdr(skb)->daddr;
 464
 465        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 466        if (n)
 467                return n;
 468        return neigh_create(&arp_tbl, pkey, dev);
 469}
 470
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
/* Fallback IP-ID generator used when no inet_peer is available: a
 * single global counter salted per destination via secure_ip_id().
 * NOTE(review): a shared counter makes IDs partially predictable
 * across destinations — acceptable only as a last resort here.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	/* Serialize updates of the shared fallback state. */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
 490
/* Choose the IP identification field for an outgoing header: use the
 * per-destination counter kept in the inet_peer cache when one can be
 * obtained, otherwise fall back to the global salted counter.
 * @more is passed through to inet_getid() as the increment step.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct net *net = dev_net(dst->dev);
	struct inet_peer *peer;

	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
	if (peer) {
		iph->id = htons(inet_getid(peer, more));
		inet_putpeer(peer);
		return;
	}

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
 506
 507static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 508                             const struct iphdr *iph,
 509                             int oif, u8 tos,
 510                             u8 prot, u32 mark, int flow_flags)
 511{
 512        if (sk) {
 513                const struct inet_sock *inet = inet_sk(sk);
 514
 515                oif = sk->sk_bound_dev_if;
 516                mark = sk->sk_mark;
 517                tos = RT_CONN_FLAGS(sk);
 518                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 519        }
 520        flowi4_init_output(fl4, oif, mark, tos,
 521                           RT_SCOPE_UNIVERSE, prot,
 522                           flow_flags,
 523                           iph->daddr, iph->saddr, 0, 0);
 524}
 525
 526static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 527                               const struct sock *sk)
 528{
 529        const struct iphdr *iph = ip_hdr(skb);
 530        int oif = skb->dev->ifindex;
 531        u8 tos = RT_TOS(iph->tos);
 532        u8 prot = iph->protocol;
 533        u32 mark = skb->mark;
 534
 535        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 536}
 537
/* Build a flow key from a connected socket's addressing state.  With a
 * strict/loose source-route option set, the first-hop address (faddr)
 * replaces the final destination for routing purposes.  The socket's
 * IP options are read under RCU.
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}
 555
 556static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 557                                 const struct sk_buff *skb)
 558{
 559        if (skb)
 560                build_skb_flow_key(fl4, skb, sk);
 561        else
 562                build_sk_flow_key(fl4, sk);
 563}
 564
/* Free a cached route after an RCU grace period, so concurrent readers
 * holding only rcu_read_lock() remain safe.
 */
static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

/* Serializes writers of the per-nexthop exception hash (readers use RCU). */
static DEFINE_SPINLOCK(fnhe_lock);
 571
/* Pick the least-recently-stamped exception in a hash bucket for reuse.
 * Precondition: the bucket chain is non-empty (the caller only reclaims
 * when the chain depth exceeded FNHE_RECLAIM_DEPTH).  Any cached route
 * attached to the victim is detached and freed via RCU.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;
	struct rtable *orig;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	orig = rcu_dereference(oldest->fnhe_rth);
	if (orig) {
		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
		rt_free(orig);
	}
	return oldest;
}
 590
 591static inline u32 fnhe_hashfun(__be32 daddr)
 592{
 593        u32 hval;
 594
 595        hval = (__force u32) daddr;
 596        hval ^= (hval >> 11) ^ (hval >> 22);
 597
 598        return hval & (FNHE_HASH_SIZE - 1);
 599}
 600
/* Record a per-destination next-hop exception (a learned redirect
 * gateway and/or a path-MTU value with its expiry) on @nh.  An existing
 * entry for @daddr is updated in place; otherwise a new one is created,
 * reclaiming the oldest entry if the bucket chain is already deep.
 * All modifications are serialized by fnhe_lock; lookups are RCU.
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	/* Lazily allocate the hash table on first exception. */
	hash = nh->nh_exceptions;
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		nh->nh_exceptions = hash;
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		/* Update only the fields the caller actually supplied. */
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = expires;
		}
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			/* Fully initialize before publishing to RCU readers. */
			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
	return;
}
 659
/* Handle an incoming ICMP redirect for @rt.  Validates the message
 * (known redirect code, sender is our current gateway, new gateway is a
 * sane unicast/on-link address per device policy) and, if acceptable,
 * records the new gateway as a next-hop exception; with @kill_route the
 * current dst is also marked obsolete so it gets re-looked-up.
 * Rejected redirects are optionally logged as martians.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* Only the gateway we are actually using may redirect us. */
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* New gateway must be directly reachable on this link and,
		 * with secure redirects, one of our configured gateways.
		 */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			/* Not resolved yet: kick off neighbour resolution. */
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
 739
 740static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 741{
 742        struct rtable *rt;
 743        struct flowi4 fl4;
 744
 745        rt = (struct rtable *) dst;
 746
 747        ip_rt_build_flow_key(&fl4, sk, skb);
 748        __ip_do_redirect(rt, skb, &fl4, true);
 749}
 750
 751static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 752{
 753        struct rtable *rt = (struct rtable *)dst;
 754        struct dst_entry *ret = dst;
 755
 756        if (rt) {
 757                if (dst->obsolete > 0) {
 758                        ip_rt_put(rt);
 759                        ret = NULL;
 760                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 761                           rt->dst.expires) {
 762                        ip_rt_put(rt);
 763                        ret = NULL;
 764                }
 765        }
 766        return ret;
 767}
 768
 769/*
 770 * Algorithm:
 771 *      1. The first ip_rt_redirect_number redirects are sent
 772 *         with exponential backoff, then we stop sending them at all,
 773 *         assuming that the host ignores our redirects.
 774 *      2. If we did not see packets requiring redirects
 775 *         during ip_rt_redirect_silence, we assume that the host
 776 *         forgot redirected route and start to send redirects again.
 777 *
 778 * This algorithm is much cheaper and more intelligent than dumb load limiting
 779 * in icmp.c.
 780 *
 781 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 782 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 783 */
 784
/* Send an ICMP redirect for a packet we are forwarding back out the
 * interface it arrived on, applying the exponential-backoff algorithm
 * documented above.  Rate state is kept per source address in the
 * inet_peer cache; without a peer entry the redirect is sent unlimited.
 * NOTE(review): peer->rate_tokens is shared with ip_error()'s ICMP
 * error bucket — the two rate limiters can interfere; verify intended.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		/* No rate state available: send without limiting. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
 847
 848static int ip_error(struct sk_buff *skb)
 849{
 850        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 851        struct rtable *rt = skb_rtable(skb);
 852        struct inet_peer *peer;
 853        unsigned long now;
 854        struct net *net;
 855        bool send;
 856        int code;
 857
 858        net = dev_net(rt->dst.dev);
 859        if (!IN_DEV_FORWARD(in_dev)) {
 860                switch (rt->dst.error) {
 861                case EHOSTUNREACH:
 862                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 863                        break;
 864
 865                case ENETUNREACH:
 866                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 867                        break;
 868                }
 869                goto out;
 870        }
 871
 872        switch (rt->dst.error) {
 873        case EINVAL:
 874        default:
 875                goto out;
 876        case EHOSTUNREACH:
 877                code = ICMP_HOST_UNREACH;
 878                break;
 879        case ENETUNREACH:
 880                code = ICMP_NET_UNREACH;
 881                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 882                break;
 883        case EACCES:
 884                code = ICMP_PKT_FILTERED;
 885                break;
 886        }
 887
 888        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 889
 890        send = true;
 891        if (peer) {
 892                now = jiffies;
 893                peer->rate_tokens += now - peer->rate_last;
 894                if (peer->rate_tokens > ip_rt_error_burst)
 895                        peer->rate_tokens = ip_rt_error_burst;
 896                peer->rate_last = now;
 897                if (peer->rate_tokens >= ip_rt_error_cost)
 898                        peer->rate_tokens -= ip_rt_error_cost;
 899                else
 900                        send = false;
 901                inet_putpeer(peer);
 902        }
 903        if (send)
 904                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 905
 906out:    kfree_skb(skb);
 907        return 0;
 908}
 909
/* Record a newly learned path MTU for @rt (e.g. from an ICMP
 * fragmentation-needed message): update the route itself and install a
 * nexthop exception so future lookups inherit the reduced MTU.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	/* Ignore updates that would raise the MTU above the device MTU. */
	if (dst->dev->mtu < mtu)
		return;

	/* Clamp to the configured floor. */
	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (!rt->rt_pmtu) {
		/* Route carries no PMTU state of its own: mark it dead so
		 * the next ipv4_dst_check() forces a fresh lookup, which
		 * will pick up the exception created below.
		 */
		dst->obsolete = DST_OBSOLETE_KILL;
	} else {
		rt->rt_pmtu = mtu;
		/* max(1UL, ...) keeps expires non-zero even on jiffies
		 * wraparound (presumably 0 reads as "not set" -- confirm
		 * against dst_set_expires()).
		 */
		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
	}

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		/* Persist the learned MTU on the nexthop for this daddr. */
		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
 937
 938static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 939                              struct sk_buff *skb, u32 mtu)
 940{
 941        struct rtable *rt = (struct rtable *) dst;
 942        struct flowi4 fl4;
 943
 944        ip_rt_build_flow_key(&fl4, sk, skb);
 945        __ip_rt_update_pmtu(rt, &fl4, mtu);
 946}
 947
 948void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 949                      int oif, u32 mark, u8 protocol, int flow_flags)
 950{
 951        const struct iphdr *iph = (const struct iphdr *) skb->data;
 952        struct flowi4 fl4;
 953        struct rtable *rt;
 954
 955        __build_flow_key(&fl4, NULL, iph, oif,
 956                         RT_TOS(iph->tos), protocol, mark, flow_flags);
 957        rt = __ip_route_output_key(net, &fl4);
 958        if (!IS_ERR(rt)) {
 959                __ip_rt_update_pmtu(rt, &fl4, mtu);
 960                ip_rt_put(rt);
 961        }
 962}
 963EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 964
 965void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 966{
 967        const struct iphdr *iph = (const struct iphdr *) skb->data;
 968        struct flowi4 fl4;
 969        struct rtable *rt;
 970
 971        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 972        rt = __ip_route_output_key(sock_net(sk), &fl4);
 973        if (!IS_ERR(rt)) {
 974                __ip_rt_update_pmtu(rt, &fl4, mtu);
 975                ip_rt_put(rt);
 976        }
 977}
 978EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 979
 980void ipv4_redirect(struct sk_buff *skb, struct net *net,
 981                   int oif, u32 mark, u8 protocol, int flow_flags)
 982{
 983        const struct iphdr *iph = (const struct iphdr *) skb->data;
 984        struct flowi4 fl4;
 985        struct rtable *rt;
 986
 987        __build_flow_key(&fl4, NULL, iph, oif,
 988                         RT_TOS(iph->tos), protocol, mark, flow_flags);
 989        rt = __ip_route_output_key(net, &fl4);
 990        if (!IS_ERR(rt)) {
 991                __ip_do_redirect(rt, skb, &fl4, false);
 992                ip_rt_put(rt);
 993        }
 994}
 995EXPORT_SYMBOL_GPL(ipv4_redirect);
 996
 997void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
 998{
 999        const struct iphdr *iph = (const struct iphdr *) skb->data;
1000        struct flowi4 fl4;
1001        struct rtable *rt;
1002
1003        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1004        rt = __ip_route_output_key(sock_net(sk), &fl4);
1005        if (!IS_ERR(rt)) {
1006                __ip_do_redirect(rt, skb, &fl4, false);
1007                ip_rt_put(rt);
1008        }
1009}
1010EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1011
1012static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1013{
1014        struct rtable *rt = (struct rtable *) dst;
1015
1016        /* All IPV4 dsts are created with ->obsolete set to the value
1017         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1018         * into this function always.
1019         *
1020         * When a PMTU/redirect information update invalidates a
1021         * route, this is indicated by setting obsolete to
1022         * DST_OBSOLETE_KILL.
1023         */
1024        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1025                return NULL;
1026        return dst;
1027}
1028
1029static void ipv4_link_failure(struct sk_buff *skb)
1030{
1031        struct rtable *rt;
1032
1033        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034
1035        rt = skb_rtable(skb);
1036        if (rt)
1037                dst_set_expires(&rt->dst, 0);
1038}
1039
1040static int ip_rt_bug(struct sk_buff *skb)
1041{
1042        pr_debug("%s: %pI4 -> %pI4, %s\n",
1043                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044                 skb->dev ? skb->dev->name : "?");
1045        kfree_skb(skb);
1046        WARN_ON(1);
1047        return 0;
1048}
1049
1050/*
1051   We do not cache source address of outgoing interface,
1052   because it is used only by IP RR, TS and SRR options,
1053   so that it out of fast path.
1054
1055   BTW remember: "addr" is allowed to be not aligned
1056   in IP options!
1057 */
1058
/* Fill @addr (4 bytes, possibly unaligned -- it points into IP option
 * space) with the source address this host uses on route @rt.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		/* Locally generated packet: the header's saddr is ours. */
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Input route: redo the FIB lookup for this packet to get
		 * the preferred source address (prefsrc) for the path.
		 */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			/* No route back: fall back to a universe-scope
			 * address on the output device.
			 */
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	/* memcpy, not assignment: addr may be unaligned. */
	memcpy(addr, &src, 4);
}
1091
1092#ifdef CONFIG_IP_ROUTE_CLASSID
1093static void set_class_tag(struct rtable *rt, u32 tag)
1094{
1095        if (!(rt->dst.tclassid & 0xFFFF))
1096                rt->dst.tclassid |= tag & 0xFFFF;
1097        if (!(rt->dst.tclassid & 0xFFFF0000))
1098                rt->dst.tclassid |= tag & 0xFFFF0000;
1099}
1100#endif
1101
1102static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1103{
1104        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1105
1106        if (advmss == 0) {
1107                advmss = max_t(unsigned int, dst->dev->mtu - 40,
1108                               ip_rt_min_advmss);
1109                if (advmss > 65535 - 40)
1110                        advmss = 65535 - 40;
1111        }
1112        return advmss;
1113}
1114
/* dst_ops->mtu callback: effective MTU for this route.  Prefers a
 * still-valid learned path MTU, then the RTAX_MTU metric, then the
 * device MTU, with final clamping.
 */
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	/* Learned PMTU missing or expired: fall back to the metric. */
	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		/* Locked MTU metric on a gateway route: cap at 576
		 * (the classical IPv4 minimum-reassembly default --
		 * NOTE(review): confirm rationale).
		 */
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
1138
1139static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1140{
1141        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1142        struct fib_nh_exception *fnhe;
1143        u32 hval;
1144
1145        if (!hash)
1146                return NULL;
1147
1148        hval = fnhe_hashfun(daddr);
1149
1150        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1151             fnhe = rcu_dereference(fnhe->fnhe_next)) {
1152                if (fnhe->fnhe_daddr == daddr)
1153                        return fnhe;
1154        }
1155        return NULL;
1156}
1157
/* Bind route @rt to nexthop exception @fnhe for destination @daddr:
 * copy the exception's learned PMTU/redirect state into the route and
 * make the route the exception's cached dst.  Returns true if @rt was
 * cached, false if @fnhe belongs to a different destination.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
		/* Previously cached route is from an old generation:
		 * discard the gw/pmtu state learned along with it.
		 */
		if (orig && rt_is_expired(orig)) {
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
		}
		/* Propagate a still-valid learned PMTU into the route. */
		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			unsigned long diff = expires - jiffies;

			if (time_before(jiffies, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				dst_set_expires(&rt->dst, diff);
			}
		}
		/* A recorded redirect overrides the nexthop gateway. */
		if (fnhe->fnhe_gw) {
			rt->rt_flags |= RTCF_REDIRECTED;
			rt->rt_gateway = fnhe->fnhe_gw;
			rt->rt_uses_gateway = 1;
		} else if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		/* Publish @rt as the exception's cached route, then free
		 * the one it replaced.
		 */
		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1199
/* Try to store @rt as the cached route of nexthop @nh: input routes go
 * into nh->nh_rth_input, output routes into this CPU's
 * nh_pcpu_rth_output slot.  Returns false (route not cached) if the
 * slot changed under us, i.e. we lost the race to another updater.
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* Lockless install: only replace the entry we sampled above,
	 * and free it once replaced.
	 */
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}
1221
/* Serializes all access to rt_uncached_list. */
static DEFINE_SPINLOCK(rt_uncached_lock);
/* Routes that could not be cached on a FIB nexthop (see rt_set_nexthop);
 * tracked here so rt_flush_dev() can detach them from a dying device.
 */
static LIST_HEAD(rt_uncached_list);

/* Add @rt to the global list of uncached routes. */
static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}
1231
/* dst_ops->destroy callback: unlink the route from the uncached list
 * if rt_add_uncached_list() ever put it there (list node non-empty).
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}
1242
/* Called when @dev is going away: repoint every uncached route that
 * still references it at the netns loopback device, so each route keeps
 * a valid device reference until it is destroyed.
 */
void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			/* Transfer the device reference: hold the new
			 * (loopback) device, release the dying one.
			 */
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}
1260
1261static bool rt_cache_valid(const struct rtable *rt)
1262{
1263        return  rt &&
1264                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1265                !rt_is_expired(rt);
1266}
1267
/* Finish initialising route @rt from FIB result @res: copy the gateway
 * and metrics from the nexthop, then try to cache the route in the
 * nexthop exception @fnhe (if given) or on the nexthop itself.  Routes
 * that end up uncached are flagged DST_NOCACHE and put on the global
 * uncached list.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Only link-scope nexthops supply a real gateway. */
		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1311
1312static struct rtable *rt_dst_alloc(struct net_device *dev,
1313                                   bool nopolicy, bool noxfrm, bool will_cache)
1314{
1315        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1316                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1317                         (nopolicy ? DST_NOPOLICY : 0) |
1318                         (noxfrm ? DST_NOXFRM : 0));
1319}
1320
/* called in rcu_read_lock() section */
/* Build the input route for a multicast packet (@daddr is multicast).
 * @our: non-zero when this host is a member of the group, in which case
 * the packet is also delivered locally.  Returns 0 and attaches the dst
 * to @skb on success, negative errno otherwise.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Multicast/broadcast source addresses are never valid, and only
	 * IP packets are routed here.
	 */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	/* Loopback sources are martian unless route_localnet is on. */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sources are accepted only toward link-local
		 * multicast destinations.
		 */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input= 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-local multicast with forwarding enabled goes through the
	 * multicast routing engine instead.
	 */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1392
1393
/* Account for (and, with CONFIG_IP_ROUTE_VERBOSE plus log_martians,
 * log) a packet whose source address failed validation.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
1418
/* called in rcu_read_lock() section */
/* Build a unicast forwarding route from FIB result @res and attach it
 * to @skb, reusing the nexthop's cached input route when still valid.
 * Returns 0 on success or a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* Cache only when we have a fib_info and no source class tag. */
	do_cache = res->fi && !itag;
	/* Packet would leave through the interface it arrived on:
	 * candidate for an ICMP redirect (NOTE(review): err here is
	 * fib_validate_source()'s non-negative result; confirm its exact
	 * meaning against that function).  Redirected routes are never
	 * cached.
	 */
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* Fast path: reuse the route already cached on the nexthop. */
	if (do_cache) {
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1507
/* Select a nexthop from @res (multipath-aware) and build the input
 * route for the packet.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* More than one nexthop: let the FIB pick one. */
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1522
1523/*
1524 *      NOTE. We drop all the packets that has local source
1525 *      addresses, because every properly looped back packet
1526 *      must have correct destination already attached by output routine.
1527 *
1528 *      Such approach solves two big problems:
1529 *      1. Not simplex devices are handled properly.
1530 *      2. IP spoofing attempts are filtered with 100% of guarantee.
1531 *      called with rcu_read_lock()
1532 */
1533
/* Slow-path input route resolution: validate source and destination
 * against martian rules, look the packet up in the FIB and build (or
 * reuse a cached) rtable for forwarding, local delivery or broadcast.
 * Returns 0 on success or a negative errno; on success the dst is
 * attached to @skb.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and call it once if daddr or/and saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		/* Reverse-path validate the source before local delivery. */
		err = fib_validate_source(skb, saddr, daddr, tos,
					  LOOPBACK_IFINDEX,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Try to reuse the route cached on the nexthop; only cacheable
	 * when there is a fib_info and no source class tag.
	 */
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	/* Local routes must never be used for output. */
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (res.type == RTN_UNREACHABLE) {
		/* Deliver into ip_error() so the matching ICMP error is
		 * generated for each packet.
		 */
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	if (do_cache)
		rt_cache_route(&FIB_RES_NH(res), rth);
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
1710
1711int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1712                         u8 tos, struct net_device *dev)
1713{
1714        int res;
1715
1716        rcu_read_lock();
1717
1718        /* Multicast recognition logic is moved from route cache to here.
1719           The problem was that too many Ethernet cards have broken/missing
1720           hardware multicast filters :-( As result the host on multicasting
1721           network acquires a lot of useless route cache entries, sort of
1722           SDR messages from all the world. Now we try to get rid of them.
1723           Really, provided software IP multicast filter is organized
1724           reasonably (at least, hashed), it does not result in a slowdown
1725           comparing with route cache reject entries.
1726           Note, that multicast routers are not affected, because
1727           route cache entry is created eventually.
1728         */
1729        if (ipv4_is_multicast(daddr)) {
1730                struct in_device *in_dev = __in_dev_get_rcu(dev);
1731
1732                if (in_dev) {
1733                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1734                                                  ip_hdr(skb)->protocol);
1735                        if (our
1736#ifdef CONFIG_IP_MROUTE
1737                                ||
1738                            (!ipv4_is_local_multicast(daddr) &&
1739                             IN_DEV_MFORWARD(in_dev))
1740#endif
1741                           ) {
1742                                int res = ip_route_input_mc(skb, daddr, saddr,
1743                                                            tos, dev, our);
1744                                rcu_read_unlock();
1745                                return res;
1746                        }
1747                }
1748                rcu_read_unlock();
1749                return -EINVAL;
1750        }
1751        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1752        rcu_read_unlock();
1753        return res;
1754}
1755EXPORT_SYMBOL(ip_route_input_noref);
1756
/* called with rcu_read_lock() */
/*
 * __mkroute_output - build an output route for an already-resolved flow.
 * @res:      FIB lookup result for the destination
 * @fl4:      flow key (addresses, tos, oif, flags, ...)
 * @orig_oif: oif as originally requested by the caller (stored in rt_iif)
 * @dev_out:  egress device
 * @flags:    initial RTCF_* flags
 *
 * Tries to reuse a cached rtable (per-destination exception entry or the
 * nexthop's per-cpu output cache) before allocating a fresh one.
 * Returns the rtable with a reference held, or an ERR_PTR() on error.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* A loopback source may only leave via a loopback device, unless
	 * route_localnet is enabled on the egress device.
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	/* Reclassify the route type from the destination address itself. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* RTCF_LOCAL only if we are a member of the group;
		 * a positive membership check also disables caching.
		 */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Prefer a per-destination exception entry over the
		 * nexthop's per-cpu cached output route.
		 */
		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else {
			/* FLOWI_FLAG_KNOWN_NH without an on-link gateway
			 * must not share the per-cpu cached route.
			 */
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		/* Locally-destined mcast/bcast leaving a non-loopback
		 * device goes out through ip_mc_output.
		 */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}
1878
1879/*
1880 * Major route resolver routine.
1881 */
1882
1883struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1884{
1885        struct net_device *dev_out = NULL;
1886        __u8 tos = RT_FL_TOS(fl4);
1887        unsigned int flags = 0;
1888        struct fib_result res;
1889        struct rtable *rth;
1890        int orig_oif;
1891
1892        res.tclassid    = 0;
1893        res.fi          = NULL;
1894        res.table       = NULL;
1895
1896        orig_oif = fl4->flowi4_oif;
1897
1898        fl4->flowi4_iif = LOOPBACK_IFINDEX;
1899        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1900        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1901                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1902
1903        rcu_read_lock();
1904        if (fl4->saddr) {
1905                rth = ERR_PTR(-EINVAL);
1906                if (ipv4_is_multicast(fl4->saddr) ||
1907                    ipv4_is_lbcast(fl4->saddr) ||
1908                    ipv4_is_zeronet(fl4->saddr))
1909                        goto out;
1910
1911                /* I removed check for oif == dev_out->oif here.
1912                   It was wrong for two reasons:
1913                   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1914                      is assigned to multiple interfaces.
1915                   2. Moreover, we are allowed to send packets with saddr
1916                      of another iface. --ANK
1917                 */
1918
1919                if (fl4->flowi4_oif == 0 &&
1920                    (ipv4_is_multicast(fl4->daddr) ||
1921                     ipv4_is_lbcast(fl4->daddr))) {
1922                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1923                        dev_out = __ip_dev_find(net, fl4->saddr, false);
1924                        if (dev_out == NULL)
1925                                goto out;
1926
1927                        /* Special hack: user can direct multicasts
1928                           and limited broadcast via necessary interface
1929                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1930                           This hack is not just for fun, it allows
1931                           vic,vat and friends to work.
1932                           They bind socket to loopback, set ttl to zero
1933                           and expect that it will work.
1934                           From the viewpoint of routing cache they are broken,
1935                           because we are not allowed to build multicast path
1936                           with loopback source addr (look, routing cache
1937                           cannot know, that ttl is zero, so that packet
1938                           will not leave this host and route is valid).
1939                           Luckily, this hack is good workaround.
1940                         */
1941
1942                        fl4->flowi4_oif = dev_out->ifindex;
1943                        goto make_route;
1944                }
1945
1946                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1947                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1948                        if (!__ip_dev_find(net, fl4->saddr, false))
1949                                goto out;
1950                }
1951        }
1952
1953
1954        if (fl4->flowi4_oif) {
1955                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1956                rth = ERR_PTR(-ENODEV);
1957                if (dev_out == NULL)
1958                        goto out;
1959
1960                /* RACE: Check return value of inet_select_addr instead. */
1961                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1962                        rth = ERR_PTR(-ENETUNREACH);
1963                        goto out;
1964                }
1965                if (ipv4_is_local_multicast(fl4->daddr) ||
1966                    ipv4_is_lbcast(fl4->daddr)) {
1967                        if (!fl4->saddr)
1968                                fl4->saddr = inet_select_addr(dev_out, 0,
1969                                                              RT_SCOPE_LINK);
1970                        goto make_route;
1971                }
1972                if (fl4->saddr) {
1973                        if (ipv4_is_multicast(fl4->daddr))
1974                                fl4->saddr = inet_select_addr(dev_out, 0,
1975                                                              fl4->flowi4_scope);
1976                        else if (!fl4->daddr)
1977                                fl4->saddr = inet_select_addr(dev_out, 0,
1978                                                              RT_SCOPE_HOST);
1979                }
1980        }
1981
1982        if (!fl4->daddr) {
1983                fl4->daddr = fl4->saddr;
1984                if (!fl4->daddr)
1985                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1986                dev_out = net->loopback_dev;
1987                fl4->flowi4_oif = LOOPBACK_IFINDEX;
1988                res.type = RTN_LOCAL;
1989                flags |= RTCF_LOCAL;
1990                goto make_route;
1991        }
1992
1993        if (fib_lookup(net, fl4, &res)) {
1994                res.fi = NULL;
1995                res.table = NULL;
1996                if (fl4->flowi4_oif) {
1997                        /* Apparently, routing tables are wrong. Assume,
1998                           that the destination is on link.
1999
2000                           WHY? DW.
2001                           Because we are allowed to send to iface
2002                           even if it has NO routes and NO assigned
2003                           addresses. When oif is specified, routing
2004                           tables are looked up with only one purpose:
2005                           to catch if destination is gatewayed, rather than
2006                           direct. Moreover, if MSG_DONTROUTE is set,
2007                           we send packet, ignoring both routing tables
2008                           and ifaddr state. --ANK
2009
2010
2011                           We could make it even if oif is unknown,
2012                           likely IPv6, but we do not.
2013                         */
2014
2015                        if (fl4->saddr == 0)
2016                                fl4->saddr = inet_select_addr(dev_out, 0,
2017                                                              RT_SCOPE_LINK);
2018                        res.type = RTN_UNICAST;
2019                        goto make_route;
2020                }
2021                rth = ERR_PTR(-ENETUNREACH);
2022                goto out;
2023        }
2024
2025        if (res.type == RTN_LOCAL) {
2026                if (!fl4->saddr) {
2027                        if (res.fi->fib_prefsrc)
2028                                fl4->saddr = res.fi->fib_prefsrc;
2029                        else
2030                                fl4->saddr = fl4->daddr;
2031                }
2032                dev_out = net->loopback_dev;
2033                fl4->flowi4_oif = dev_out->ifindex;
2034                flags |= RTCF_LOCAL;
2035                goto make_route;
2036        }
2037
2038#ifdef CONFIG_IP_ROUTE_MULTIPATH
2039        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2040                fib_select_multipath(&res);
2041        else
2042#endif
2043        if (!res.prefixlen &&
2044            res.table->tb_num_default > 1 &&
2045            res.type == RTN_UNICAST && !fl4->flowi4_oif)
2046                fib_select_default(&res);
2047
2048        if (!fl4->saddr)
2049                fl4->saddr = FIB_RES_PREFSRC(net, res);
2050
2051        dev_out = FIB_RES_DEV(res);
2052        fl4->flowi4_oif = dev_out->ifindex;
2053
2054
2055make_route:
2056        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2057
2058out:
2059        rcu_read_unlock();
2060        return rth;
2061}
2062EXPORT_SYMBOL_GPL(__ip_route_output_key);
2063
/* Blackhole dsts are never valid: always force the caller to relookup. */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2068
2069static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2070{
2071        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2072
2073        return mtu ? : dst->dev->mtu;
2074}
2075
/* PMTU updates are deliberately ignored on blackhole routes. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}
2080
/* Redirects are deliberately ignored on blackhole routes. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2085
/* Blackhole routes never get writable (COW) metrics. */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2091
/* dst_ops for blackhole routes created by ipv4_blackhole_route(): every
 * mutating operation is a no-op and ->check always invalidates.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2103
/*
 * ipv4_blackhole_route - clone @dst_orig into a discarding route.
 *
 * The clone copies the routing metadata of @dst_orig but both its input
 * and output handlers drop packets (dst_discard).  The reference on
 * @dst_orig is consumed.  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the routing state of the original route. */
		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);

		/* NOTE(review): dst_free here presumably schedules the
		 * entry for release once its refcount drops, matching the
		 * initial reference taken by dst_alloc — confirm against
		 * the dst core before changing this ordering.
		 */
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2140
2141struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2142                                    struct sock *sk)
2143{
2144        struct rtable *rt = __ip_route_output_key(net, flp4);
2145
2146        if (IS_ERR(rt))
2147                return rt;
2148
2149        if (flp4->flowi4_proto)
2150                rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2151                                                   flowi4_to_flowi(flp4),
2152                                                   sk, 0);
2153
2154        return rt;
2155}
2156EXPORT_SYMBOL_GPL(ip_route_output_flow);
2157
/*
 * rt_fill_info - dump the skb's route into an RTM_* netlink message.
 * @net:    namespace (unused here beyond the signature contract)
 * @dst:    destination address to report (RTA_DST)
 * @src:    source address to report (RTA_SRC), 0 if none
 * @fl4:    flow the route was resolved for (tos, saddr, mark)
 * @skb:    message buffer being filled (also carries the rtable)
 * @portid: netlink destination port id
 * @seq:    netlink sequence number
 * @event:  message type (e.g. RTM_NEWROUTE)
 * @nowait: unused here
 * @flags:  netlink message flags
 *
 * Returns the result of nlmsg_end() on success or -EMSGSIZE if the
 * message did not fit.
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* Only output routes with a rewritten source report RTA_PREFSRC. */
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Convert the absolute expiry time into remaining jiffies. */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	/* A live PMTU overrides the stored MTU metric. */
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
			goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2248
/*
 * inet_rtm_getroute - RTM_GETROUTE handler.
 *
 * Resolves the route described by the netlink request: with RTA_IIF it
 * simulates packet input through that device (ip_route_input() on a
 * dummy skb), otherwise it performs an output lookup, then replies to
 * the requester with an RTM_NEWROUTE message built by rt_fill_info().
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		/* Input-route query: pretend the packet arrived on iif. */
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		/* A successful lookup may still yield an error route. */
		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2345
/* Route-cache dump: nothing to dump any more, report the skb length. */
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
	return skb->len;
}
2350
/* Multicast configuration changed on @in_dev: flush its namespace's
 * route cache.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
2355
2356#ifdef CONFIG_SYSCTL
2357static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2358                                        void __user *buffer,
2359                                        size_t *lenp, loff_t *ppos)
2360{
2361        if (write) {
2362                rt_cache_flush((struct net *)__ctl->extra1);
2363                return 0;
2364        }
2365
2366        return -EINVAL;
2367}
2368
/* Global (init_net only) tunables under /proc/sys/net/ipv4/route/. */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, exposed in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
2479
/* Per-namespace "flush" entry; .extra1 is filled with the owning struct
 * net in sysctl_route_net_init().  Write-only (mode 0200).
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
2489
/*
 * Register the per-namespace route sysctl table.  init_net uses the
 * static table directly; other namespaces get a private copy so each
 * can carry its own ->extra1 back-pointer.
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	/* Stash the namespace for ipv4_sysctl_rtcache_flush(). */
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	/* Only free the duplicate, never the static table. */
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
2513
/*
 * Unregister the per-namespace route sysctl table and free the copy
 * made in sysctl_route_net_init().  init_net never runs a net exit,
 * so reaching here with the static table indicates a bug.
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
2523
/* pernet hooks for the per-namespace route sysctl table. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
2528#endif
2529
/* Per-namespace init: reset the routing generation counter and seed the
 * per-device-address generation id with random bytes.
 */
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->rt_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
2537
/* pernet hook for routing generation-id initialization. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
2541
/* Allocate and initialize the namespace's inetpeer base. */
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}
2552
/* Tear down the namespace's inetpeer base: detach it first, then
 * invalidate the peer tree before freeing.
 */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
2561
/* pernet hooks for the IPv4 inetpeer base. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
2566
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route accounting buckets; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2570
/*
 * Boot-time initialization of the IPv4 routing subsystem: slab caches,
 * dst entry counters, devinet/FIB init, proc files, xfrm, the
 * RTM_GETROUTE handler and the pernet subsystems.  Panics on the
 * allocations the stack cannot run without.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the regular rtable slab. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Effectively disable gc limits. */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
2614
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the global route sysctl table for init_net early in boot. */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif
2625