1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65#define pr_fmt(fmt) "IPv4: " fmt
66
67#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
89#include <linux/rcupdate.h>
90#include <linux/times.h>
91#include <linux/slab.h>
92#include <net/dst.h>
93#include <net/net_namespace.h>
94#include <net/protocol.h>
95#include <net/ip.h>
96#include <net/route.h>
97#include <net/inetpeer.h>
98#include <net/sock.h>
99#include <net/ip_fib.h>
100#include <net/arp.h>
101#include <net/tcp.h>
102#include <net/icmp.h>
103#include <net/xfrm.h>
104#include <net/netevent.h>
105#include <net/rtnetlink.h>
106#ifdef CONFIG_SYSCTL
107#include <linux/sysctl.h>
108#include <linux/kmemleak.h>
109#endif
110#include <net/secure_seq.h>
111
112#define RT_FL_TOS(oldflp4) \
113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
119static int ip_rt_max_size;
120static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123static int ip_rt_redirect_number __read_mostly = 9;
124static int ip_rt_redirect_load __read_mostly = HZ / 50;
125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126static int ip_rt_error_cost __read_mostly = HZ;
127static int ip_rt_error_burst __read_mostly = 5 * HZ;
128static int ip_rt_gc_elasticity __read_mostly = 8;
129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256;
132
133
134
135
136
137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139static unsigned int ipv4_mtu(const struct dst_entry *dst);
140static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141static void ipv4_link_failure(struct sk_buff *skb);
142static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu);
144static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
146static void ipv4_dst_destroy(struct dst_entry *dst);
147
/* dst_ops.ifdown callback: IPv4 has nothing to do here, so it is a no-op. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
152
153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154{
155 WARN_ON(1);
156 return NULL;
157}
158
/* Forward declaration; the definition follows the procfs code below. */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
162
163static struct dst_ops ipv4_dst_ops = {
164 .family = AF_INET,
165 .protocol = cpu_to_be16(ETH_P_IP),
166 .check = ipv4_dst_check,
167 .default_advmss = ipv4_default_advmss,
168 .mtu = ipv4_mtu,
169 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy,
171 .ifdown = ipv4_dst_ifdown,
172 .negative_advice = ipv4_negative_advice,
173 .link_failure = ipv4_link_failure,
174 .update_pmtu = ip_rt_update_pmtu,
175 .redirect = ip_do_redirect,
176 .local_out = __ip_local_out,
177 .neigh_lookup = ipv4_neigh_lookup,
178};
179
180#define ECN_OR_COST(class) TC_PRIO_##class
181
182const __u8 ip_tos2prio[16] = {
183 TC_PRIO_BESTEFFORT,
184 ECN_OR_COST(BESTEFFORT),
185 TC_PRIO_BESTEFFORT,
186 ECN_OR_COST(BESTEFFORT),
187 TC_PRIO_BULK,
188 ECN_OR_COST(BULK),
189 TC_PRIO_BULK,
190 ECN_OR_COST(BULK),
191 TC_PRIO_INTERACTIVE,
192 ECN_OR_COST(INTERACTIVE),
193 TC_PRIO_INTERACTIVE,
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
199};
200EXPORT_SYMBOL(ip_tos2prio);
201
202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205#ifdef CONFIG_PROC_FS
206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207{
208 if (*pos)
209 return NULL;
210 return SEQ_START_TOKEN;
211}
212
213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214{
215 ++*pos;
216 return NULL;
217}
218
/* seq_file stop: start() acquires nothing, so there is nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
222
223static int rt_cache_seq_show(struct seq_file *seq, void *v)
224{
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
230 return 0;
231}
232
233static const struct seq_operations rt_cache_seq_ops = {
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
238};
239
240static int rt_cache_seq_open(struct inode *inode, struct file *file)
241{
242 return seq_open(file, &rt_cache_seq_ops);
243}
244
245static const struct file_operations rt_cache_seq_fops = {
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
248 .read = seq_read,
249 .llseek = seq_lseek,
250 .release = seq_release,
251};
252
253
254static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255{
256 int cpu;
257
258 if (*pos == 0)
259 return SEQ_START_TOKEN;
260
261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262 if (!cpu_possible(cpu))
263 continue;
264 *pos = cpu+1;
265 return &per_cpu(rt_cache_stat, cpu);
266 }
267 return NULL;
268}
269
270static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271{
272 int cpu;
273
274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275 if (!cpu_possible(cpu))
276 continue;
277 *pos = cpu+1;
278 return &per_cpu(rt_cache_stat, cpu);
279 }
280 return NULL;
281
282}
283
/* seq_file stop: the iterator holds no resources, so this is a no-op. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
288
289static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290{
291 struct rt_cache_stat *st = v;
292
293 if (v == SEQ_START_TOKEN) {
294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295 return 0;
296 }
297
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 dst_entries_get_slow(&ipv4_dst_ops),
301 st->in_hit,
302 st->in_slow_tot,
303 st->in_slow_mc,
304 st->in_no_route,
305 st->in_brd,
306 st->in_martian_dst,
307 st->in_martian_src,
308
309 st->out_hit,
310 st->out_slow_tot,
311 st->out_slow_mc,
312
313 st->gc_total,
314 st->gc_ignored,
315 st->gc_goal_miss,
316 st->gc_dst_overflow,
317 st->in_hlist_search,
318 st->out_hlist_search
319 );
320 return 0;
321}
322
323static const struct seq_operations rt_cpu_seq_ops = {
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
328};
329
330
331static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332{
333 return seq_open(file, &rt_cpu_seq_ops);
334}
335
336static const struct file_operations rt_cpu_seq_fops = {
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
339 .read = seq_read,
340 .llseek = seq_lseek,
341 .release = seq_release,
342};
343
344#ifdef CONFIG_IP_ROUTE_CLASSID
345static int rt_acct_proc_show(struct seq_file *m, void *v)
346{
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
362 }
363
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367}
368
369static int rt_acct_proc_open(struct inode *inode, struct file *file)
370{
371 return single_open(file, rt_acct_proc_show, NULL);
372}
373
374static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
377 .read = seq_read,
378 .llseek = seq_lseek,
379 .release = single_release,
380};
381#endif
382
383static int __net_init ip_rt_do_proc_init(struct net *net)
384{
385 struct proc_dir_entry *pde;
386
387 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
388 &rt_cache_seq_fops);
389 if (!pde)
390 goto err1;
391
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
394 if (!pde)
395 goto err2;
396
397#ifdef CONFIG_IP_ROUTE_CLASSID
398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399 if (!pde)
400 goto err3;
401#endif
402 return 0;
403
404#ifdef CONFIG_IP_ROUTE_CLASSID
405err3:
406 remove_proc_entry("rt_cache", net->proc_net_stat);
407#endif
408err2:
409 remove_proc_entry("rt_cache", net->proc_net);
410err1:
411 return -ENOMEM;
412}
413
414static void __net_exit ip_rt_do_proc_exit(struct net *net)
415{
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
418#ifdef CONFIG_IP_ROUTE_CLASSID
419 remove_proc_entry("rt_acct", net->proc_net);
420#endif
421}
422
423static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
426};
427
428static int __init ip_rt_proc_init(void)
429{
430 return register_pernet_subsys(&ip_rt_proc_ops);
431}
432
433#else
/* Without CONFIG_PROC_FS there is nothing to register; always succeeds. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
438#endif
439
440static inline bool rt_is_expired(const struct rtable *rth)
441{
442 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
443}
444
/*
 * Invalidate every cached route of @net by bumping the namespace's
 * generation id; rt_is_expired() then reports older routes as stale.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}
449
450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
453{
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
456 const struct rtable *rt;
457 struct neighbour *n;
458
459 rt = (const struct rtable *) dst;
460 if (rt->rt_gateway)
461 pkey = (const __be32 *) &rt->rt_gateway;
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
464
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466 if (n)
467 return n;
468 return neigh_create(&arp_tbl, pkey, dev);
469}
470
471
472
473
474
475
476
477
478static void ip_select_fb_ident(struct iphdr *iph)
479{
480 static DEFINE_SPINLOCK(ip_fb_id_lock);
481 static u32 ip_fallback_id;
482 u32 salt;
483
484 spin_lock_bh(&ip_fb_id_lock);
485 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
486 iph->id = htons(salt & 0xFFFF);
487 ip_fallback_id = salt;
488 spin_unlock_bh(&ip_fb_id_lock);
489}
490
491void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
492{
493 struct net *net = dev_net(dst->dev);
494 struct inet_peer *peer;
495
496 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
497 if (peer) {
498 iph->id = htons(inet_getid(peer, more));
499 inet_putpeer(peer);
500 return;
501 }
502
503 ip_select_fb_ident(iph);
504}
505EXPORT_SYMBOL(__ip_select_ident);
506
507static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
508 const struct iphdr *iph,
509 int oif, u8 tos,
510 u8 prot, u32 mark, int flow_flags)
511{
512 if (sk) {
513 const struct inet_sock *inet = inet_sk(sk);
514
515 oif = sk->sk_bound_dev_if;
516 mark = sk->sk_mark;
517 tos = RT_CONN_FLAGS(sk);
518 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
519 }
520 flowi4_init_output(fl4, oif, mark, tos,
521 RT_SCOPE_UNIVERSE, prot,
522 flow_flags,
523 iph->daddr, iph->saddr, 0, 0);
524}
525
526static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527 const struct sock *sk)
528{
529 const struct iphdr *iph = ip_hdr(skb);
530 int oif = skb->dev->ifindex;
531 u8 tos = RT_TOS(iph->tos);
532 u8 prot = iph->protocol;
533 u32 mark = skb->mark;
534
535 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536}
537
538static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
539{
540 const struct inet_sock *inet = inet_sk(sk);
541 const struct ip_options_rcu *inet_opt;
542 __be32 daddr = inet->inet_daddr;
543
544 rcu_read_lock();
545 inet_opt = rcu_dereference(inet->inet_opt);
546 if (inet_opt && inet_opt->opt.srr)
547 daddr = inet_opt->opt.faddr;
548 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551 inet_sk_flowi_flags(sk),
552 daddr, inet->inet_saddr, 0, 0);
553 rcu_read_unlock();
554}
555
/* Build a flow key from @skb when one is supplied, otherwise from @sk. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
564
565static inline void rt_free(struct rtable *rt)
566{
567 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
568}
569
/* Taken by update_or_create_fnhe() and rt_bind_exception() around updates. */
static DEFINE_SPINLOCK(fnhe_lock);
571
572static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
573{
574 struct fib_nh_exception *fnhe, *oldest;
575 struct rtable *orig;
576
577 oldest = rcu_dereference(hash->chain);
578 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
579 fnhe = rcu_dereference(fnhe->fnhe_next)) {
580 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
581 oldest = fnhe;
582 }
583 orig = rcu_dereference(oldest->fnhe_rth);
584 if (orig) {
585 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
586 rt_free(orig);
587 }
588 return oldest;
589}
590
591static inline u32 fnhe_hashfun(__be32 daddr)
592{
593 u32 hval;
594
595 hval = (__force u32) daddr;
596 hval ^= (hval >> 11) ^ (hval >> 22);
597
598 return hval & (FNHE_HASH_SIZE - 1);
599}
600
601static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
602 u32 pmtu, unsigned long expires)
603{
604 struct fnhe_hash_bucket *hash;
605 struct fib_nh_exception *fnhe;
606 int depth;
607 u32 hval = fnhe_hashfun(daddr);
608
609 spin_lock_bh(&fnhe_lock);
610
611 hash = nh->nh_exceptions;
612 if (!hash) {
613 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
614 if (!hash)
615 goto out_unlock;
616 nh->nh_exceptions = hash;
617 }
618
619 hash += hval;
620
621 depth = 0;
622 for (fnhe = rcu_dereference(hash->chain); fnhe;
623 fnhe = rcu_dereference(fnhe->fnhe_next)) {
624 if (fnhe->fnhe_daddr == daddr)
625 break;
626 depth++;
627 }
628
629 if (fnhe) {
630 if (gw)
631 fnhe->fnhe_gw = gw;
632 if (pmtu) {
633 fnhe->fnhe_pmtu = pmtu;
634 fnhe->fnhe_expires = expires;
635 }
636 } else {
637 if (depth > FNHE_RECLAIM_DEPTH)
638 fnhe = fnhe_oldest(hash);
639 else {
640 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
641 if (!fnhe)
642 goto out_unlock;
643
644 fnhe->fnhe_next = hash->chain;
645 rcu_assign_pointer(hash->chain, fnhe);
646 }
647 fnhe->fnhe_daddr = daddr;
648 fnhe->fnhe_gw = gw;
649 fnhe->fnhe_pmtu = pmtu;
650 fnhe->fnhe_expires = expires;
651 }
652
653 fnhe->fnhe_stamp = jiffies;
654
655out_unlock:
656 spin_unlock_bh(&fnhe_lock);
657 return;
658}
659
660static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
661 bool kill_route)
662{
663 __be32 new_gw = icmp_hdr(skb)->un.gateway;
664 __be32 old_gw = ip_hdr(skb)->saddr;
665 struct net_device *dev = skb->dev;
666 struct in_device *in_dev;
667 struct fib_result res;
668 struct neighbour *n;
669 struct net *net;
670
671 switch (icmp_hdr(skb)->code & 7) {
672 case ICMP_REDIR_NET:
673 case ICMP_REDIR_NETTOS:
674 case ICMP_REDIR_HOST:
675 case ICMP_REDIR_HOSTTOS:
676 break;
677
678 default:
679 return;
680 }
681
682 if (rt->rt_gateway != old_gw)
683 return;
684
685 in_dev = __in_dev_get_rcu(dev);
686 if (!in_dev)
687 return;
688
689 net = dev_net(dev);
690 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
691 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
692 ipv4_is_zeronet(new_gw))
693 goto reject_redirect;
694
695 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
696 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
697 goto reject_redirect;
698 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
699 goto reject_redirect;
700 } else {
701 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
702 goto reject_redirect;
703 }
704
705 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
706 if (n) {
707 if (!(n->nud_state & NUD_VALID)) {
708 neigh_event_send(n, NULL);
709 } else {
710 if (fib_lookup(net, fl4, &res) == 0) {
711 struct fib_nh *nh = &FIB_RES_NH(res);
712
713 update_or_create_fnhe(nh, fl4->daddr, new_gw,
714 0, 0);
715 }
716 if (kill_route)
717 rt->dst.obsolete = DST_OBSOLETE_KILL;
718 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
719 }
720 neigh_release(n);
721 }
722 return;
723
724reject_redirect:
725#ifdef CONFIG_IP_ROUTE_VERBOSE
726 if (IN_DEV_LOG_MARTIANS(in_dev)) {
727 const struct iphdr *iph = (const struct iphdr *) skb->data;
728 __be32 daddr = iph->daddr;
729 __be32 saddr = iph->saddr;
730
731 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732 " Advised path = %pI4 -> %pI4\n",
733 &old_gw, dev->name, &new_gw,
734 &saddr, &daddr);
735 }
736#endif
737 ;
738}
739
740static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741{
742 struct rtable *rt;
743 struct flowi4 fl4;
744
745 rt = (struct rtable *) dst;
746
747 ip_rt_build_flow_key(&fl4, sk, skb);
748 __ip_do_redirect(rt, skb, &fl4, true);
749}
750
751static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752{
753 struct rtable *rt = (struct rtable *)dst;
754 struct dst_entry *ret = dst;
755
756 if (rt) {
757 if (dst->obsolete > 0) {
758 ip_rt_put(rt);
759 ret = NULL;
760 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761 rt->dst.expires) {
762 ip_rt_put(rt);
763 ret = NULL;
764 }
765 }
766 return ret;
767}
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785void ip_rt_send_redirect(struct sk_buff *skb)
786{
787 struct rtable *rt = skb_rtable(skb);
788 struct in_device *in_dev;
789 struct inet_peer *peer;
790 struct net *net;
791 int log_martians;
792
793 rcu_read_lock();
794 in_dev = __in_dev_get_rcu(rt->dst.dev);
795 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
796 rcu_read_unlock();
797 return;
798 }
799 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800 rcu_read_unlock();
801
802 net = dev_net(rt->dst.dev);
803 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
804 if (!peer) {
805 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
806 rt_nexthop(rt, ip_hdr(skb)->daddr));
807 return;
808 }
809
810
811
812
813 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
814 peer->rate_tokens = 0;
815
816
817
818
819 if (peer->rate_tokens >= ip_rt_redirect_number) {
820 peer->rate_last = jiffies;
821 goto out_put_peer;
822 }
823
824
825
826
827 if (peer->rate_tokens == 0 ||
828 time_after(jiffies,
829 (peer->rate_last +
830 (ip_rt_redirect_load << peer->rate_tokens)))) {
831 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
832
833 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
834 peer->rate_last = jiffies;
835 ++peer->rate_tokens;
836#ifdef CONFIG_IP_ROUTE_VERBOSE
837 if (log_martians &&
838 peer->rate_tokens == ip_rt_redirect_number)
839 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
840 &ip_hdr(skb)->saddr, inet_iif(skb),
841 &ip_hdr(skb)->daddr, &gw);
842#endif
843 }
844out_put_peer:
845 inet_putpeer(peer);
846}
847
848static int ip_error(struct sk_buff *skb)
849{
850 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
851 struct rtable *rt = skb_rtable(skb);
852 struct inet_peer *peer;
853 unsigned long now;
854 struct net *net;
855 bool send;
856 int code;
857
858 net = dev_net(rt->dst.dev);
859 if (!IN_DEV_FORWARD(in_dev)) {
860 switch (rt->dst.error) {
861 case EHOSTUNREACH:
862 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863 break;
864
865 case ENETUNREACH:
866 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
867 break;
868 }
869 goto out;
870 }
871
872 switch (rt->dst.error) {
873 case EINVAL:
874 default:
875 goto out;
876 case EHOSTUNREACH:
877 code = ICMP_HOST_UNREACH;
878 break;
879 case ENETUNREACH:
880 code = ICMP_NET_UNREACH;
881 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
882 break;
883 case EACCES:
884 code = ICMP_PKT_FILTERED;
885 break;
886 }
887
888 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
889
890 send = true;
891 if (peer) {
892 now = jiffies;
893 peer->rate_tokens += now - peer->rate_last;
894 if (peer->rate_tokens > ip_rt_error_burst)
895 peer->rate_tokens = ip_rt_error_burst;
896 peer->rate_last = now;
897 if (peer->rate_tokens >= ip_rt_error_cost)
898 peer->rate_tokens -= ip_rt_error_cost;
899 else
900 send = false;
901 inet_putpeer(peer);
902 }
903 if (send)
904 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
905
906out: kfree_skb(skb);
907 return 0;
908}
909
910static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
911{
912 struct dst_entry *dst = &rt->dst;
913 struct fib_result res;
914
915 if (dst->dev->mtu < mtu)
916 return;
917
918 if (mtu < ip_rt_min_pmtu)
919 mtu = ip_rt_min_pmtu;
920
921 if (!rt->rt_pmtu) {
922 dst->obsolete = DST_OBSOLETE_KILL;
923 } else {
924 rt->rt_pmtu = mtu;
925 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
926 }
927
928 rcu_read_lock();
929 if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
930 struct fib_nh *nh = &FIB_RES_NH(res);
931
932 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
933 jiffies + ip_rt_mtu_expires);
934 }
935 rcu_read_unlock();
936}
937
938static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
939 struct sk_buff *skb, u32 mtu)
940{
941 struct rtable *rt = (struct rtable *) dst;
942 struct flowi4 fl4;
943
944 ip_rt_build_flow_key(&fl4, sk, skb);
945 __ip_rt_update_pmtu(rt, &fl4, mtu);
946}
947
948void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
949 int oif, u32 mark, u8 protocol, int flow_flags)
950{
951 const struct iphdr *iph = (const struct iphdr *) skb->data;
952 struct flowi4 fl4;
953 struct rtable *rt;
954
955 __build_flow_key(&fl4, NULL, iph, oif,
956 RT_TOS(iph->tos), protocol, mark, flow_flags);
957 rt = __ip_route_output_key(net, &fl4);
958 if (!IS_ERR(rt)) {
959 __ip_rt_update_pmtu(rt, &fl4, mtu);
960 ip_rt_put(rt);
961 }
962}
963EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
964
965void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
966{
967 const struct iphdr *iph = (const struct iphdr *) skb->data;
968 struct flowi4 fl4;
969 struct rtable *rt;
970
971 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
972 rt = __ip_route_output_key(sock_net(sk), &fl4);
973 if (!IS_ERR(rt)) {
974 __ip_rt_update_pmtu(rt, &fl4, mtu);
975 ip_rt_put(rt);
976 }
977}
978EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
979
980void ipv4_redirect(struct sk_buff *skb, struct net *net,
981 int oif, u32 mark, u8 protocol, int flow_flags)
982{
983 const struct iphdr *iph = (const struct iphdr *) skb->data;
984 struct flowi4 fl4;
985 struct rtable *rt;
986
987 __build_flow_key(&fl4, NULL, iph, oif,
988 RT_TOS(iph->tos), protocol, mark, flow_flags);
989 rt = __ip_route_output_key(net, &fl4);
990 if (!IS_ERR(rt)) {
991 __ip_do_redirect(rt, skb, &fl4, false);
992 ip_rt_put(rt);
993 }
994}
995EXPORT_SYMBOL_GPL(ipv4_redirect);
996
997void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
998{
999 const struct iphdr *iph = (const struct iphdr *) skb->data;
1000 struct flowi4 fl4;
1001 struct rtable *rt;
1002
1003 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1004 rt = __ip_route_output_key(sock_net(sk), &fl4);
1005 if (!IS_ERR(rt)) {
1006 __ip_do_redirect(rt, skb, &fl4, false);
1007 ip_rt_put(rt);
1008 }
1009}
1010EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1011
1012static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1013{
1014 struct rtable *rt = (struct rtable *) dst;
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1025 return NULL;
1026 return dst;
1027}
1028
1029static void ipv4_link_failure(struct sk_buff *skb)
1030{
1031 struct rtable *rt;
1032
1033 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034
1035 rt = skb_rtable(skb);
1036 if (rt)
1037 dst_set_expires(&rt->dst, 0);
1038}
1039
1040static int ip_rt_bug(struct sk_buff *skb)
1041{
1042 pr_debug("%s: %pI4 -> %pI4, %s\n",
1043 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044 skb->dev ? skb->dev->name : "?");
1045 kfree_skb(skb);
1046 WARN_ON(1);
1047 return 0;
1048}
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1060{
1061 __be32 src;
1062
1063 if (rt_is_output_route(rt))
1064 src = ip_hdr(skb)->saddr;
1065 else {
1066 struct fib_result res;
1067 struct flowi4 fl4;
1068 struct iphdr *iph;
1069
1070 iph = ip_hdr(skb);
1071
1072 memset(&fl4, 0, sizeof(fl4));
1073 fl4.daddr = iph->daddr;
1074 fl4.saddr = iph->saddr;
1075 fl4.flowi4_tos = RT_TOS(iph->tos);
1076 fl4.flowi4_oif = rt->dst.dev->ifindex;
1077 fl4.flowi4_iif = skb->dev->ifindex;
1078 fl4.flowi4_mark = skb->mark;
1079
1080 rcu_read_lock();
1081 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1082 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1083 else
1084 src = inet_select_addr(rt->dst.dev,
1085 rt_nexthop(rt, iph->daddr),
1086 RT_SCOPE_UNIVERSE);
1087 rcu_read_unlock();
1088 }
1089 memcpy(addr, &src, 4);
1090}
1091
1092#ifdef CONFIG_IP_ROUTE_CLASSID
1093static void set_class_tag(struct rtable *rt, u32 tag)
1094{
1095 if (!(rt->dst.tclassid & 0xFFFF))
1096 rt->dst.tclassid |= tag & 0xFFFF;
1097 if (!(rt->dst.tclassid & 0xFFFF0000))
1098 rt->dst.tclassid |= tag & 0xFFFF0000;
1099}
1100#endif
1101
1102static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1103{
1104 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1105
1106 if (advmss == 0) {
1107 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1108 ip_rt_min_advmss);
1109 if (advmss > 65535 - 40)
1110 advmss = 65535 - 40;
1111 }
1112 return advmss;
1113}
1114
1115static unsigned int ipv4_mtu(const struct dst_entry *dst)
1116{
1117 const struct rtable *rt = (const struct rtable *) dst;
1118 unsigned int mtu = rt->rt_pmtu;
1119
1120 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1121 mtu = 0;
1122
1123 if (!mtu)
1124 mtu = dst_metric_raw(dst, RTAX_MTU);
1125
1126 if (mtu && rt_is_output_route(rt))
1127 return mtu;
1128
1129 mtu = dst->dev->mtu;
1130
1131 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1132 if (rt->rt_uses_gateway && mtu > 576)
1133 mtu = 576;
1134 }
1135
1136 if (mtu > IP_MAX_MTU)
1137 mtu = IP_MAX_MTU;
1138
1139 return mtu;
1140}
1141
1142static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1143{
1144 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1145 struct fib_nh_exception *fnhe;
1146 u32 hval;
1147
1148 if (!hash)
1149 return NULL;
1150
1151 hval = fnhe_hashfun(daddr);
1152
1153 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1154 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1155 if (fnhe->fnhe_daddr == daddr)
1156 return fnhe;
1157 }
1158 return NULL;
1159}
1160
1161static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1162 __be32 daddr)
1163{
1164 bool ret = false;
1165
1166 spin_lock_bh(&fnhe_lock);
1167
1168 if (daddr == fnhe->fnhe_daddr) {
1169 struct rtable *orig;
1170
1171 if (fnhe->fnhe_pmtu) {
1172 unsigned long expires = fnhe->fnhe_expires;
1173 unsigned long diff = expires - jiffies;
1174
1175 if (time_before(jiffies, expires)) {
1176 rt->rt_pmtu = fnhe->fnhe_pmtu;
1177 dst_set_expires(&rt->dst, diff);
1178 }
1179 }
1180 if (fnhe->fnhe_gw) {
1181 rt->rt_flags |= RTCF_REDIRECTED;
1182 rt->rt_gateway = fnhe->fnhe_gw;
1183 rt->rt_uses_gateway = 1;
1184 } else if (!rt->rt_gateway)
1185 rt->rt_gateway = daddr;
1186
1187 orig = rcu_dereference(fnhe->fnhe_rth);
1188 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1189 if (orig)
1190 rt_free(orig);
1191
1192 fnhe->fnhe_stamp = jiffies;
1193 ret = true;
1194 }
1195 spin_unlock_bh(&fnhe_lock);
1196
1197 return ret;
1198}
1199
1200static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1201{
1202 struct rtable *orig, *prev, **p;
1203 bool ret = true;
1204
1205 if (rt_is_input_route(rt)) {
1206 p = (struct rtable **)&nh->nh_rth_input;
1207 } else {
1208 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1209 }
1210 orig = *p;
1211
1212 prev = cmpxchg(p, orig, rt);
1213 if (prev == orig) {
1214 if (orig)
1215 rt_free(orig);
1216 } else
1217 ret = false;
1218
1219 return ret;
1220}
1221
/* List of routes that are not held in any cache slot, and its lock. */
static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);
1224
1225static void rt_add_uncached_list(struct rtable *rt)
1226{
1227 spin_lock_bh(&rt_uncached_lock);
1228 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1229 spin_unlock_bh(&rt_uncached_lock);
1230}
1231
1232static void ipv4_dst_destroy(struct dst_entry *dst)
1233{
1234 struct rtable *rt = (struct rtable *) dst;
1235
1236 if (!list_empty(&rt->rt_uncached)) {
1237 spin_lock_bh(&rt_uncached_lock);
1238 list_del(&rt->rt_uncached);
1239 spin_unlock_bh(&rt_uncached_lock);
1240 }
1241}
1242
1243void rt_flush_dev(struct net_device *dev)
1244{
1245 if (!list_empty(&rt_uncached_list)) {
1246 struct net *net = dev_net(dev);
1247 struct rtable *rt;
1248
1249 spin_lock_bh(&rt_uncached_lock);
1250 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1251 if (rt->dst.dev != dev)
1252 continue;
1253 rt->dst.dev = net->loopback_dev;
1254 dev_hold(rt->dst.dev);
1255 dev_put(dev);
1256 }
1257 spin_unlock_bh(&rt_uncached_lock);
1258 }
1259}
1260
1261static bool rt_cache_valid(const struct rtable *rt)
1262{
1263 return rt &&
1264 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1265 !rt_is_expired(rt);
1266}
1267
1268static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1269 const struct fib_result *res,
1270 struct fib_nh_exception *fnhe,
1271 struct fib_info *fi, u16 type, u32 itag)
1272{
1273 bool cached = false;
1274
1275 if (fi) {
1276 struct fib_nh *nh = &FIB_RES_NH(*res);
1277
1278 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1279 rt->rt_gateway = nh->nh_gw;
1280 rt->rt_uses_gateway = 1;
1281 }
1282 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1283#ifdef CONFIG_IP_ROUTE_CLASSID
1284 rt->dst.tclassid = nh->nh_tclassid;
1285#endif
1286 if (unlikely(fnhe))
1287 cached = rt_bind_exception(rt, fnhe, daddr);
1288 else if (!(rt->dst.flags & DST_NOCACHE))
1289 cached = rt_cache_route(nh, rt);
1290 if (unlikely(!cached)) {
1291
1292
1293
1294
1295
1296 rt->dst.flags |= DST_NOCACHE;
1297 if (!rt->rt_gateway)
1298 rt->rt_gateway = daddr;
1299 rt_add_uncached_list(rt);
1300 }
1301 } else
1302 rt_add_uncached_list(rt);
1303
1304#ifdef CONFIG_IP_ROUTE_CLASSID
1305#ifdef CONFIG_IP_MULTIPLE_TABLES
1306 set_class_tag(rt, res->tclassid);
1307#endif
1308 set_class_tag(rt, itag);
1309#endif
1310}
1311
1312static struct rtable *rt_dst_alloc(struct net_device *dev,
1313 bool nopolicy, bool noxfrm, bool will_cache)
1314{
1315 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1316 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1317 (nopolicy ? DST_NOPOLICY : 0) |
1318 (noxfrm ? DST_NOXFRM : 0));
1319}
1320
1321
1322static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1323 u8 tos, struct net_device *dev, int our)
1324{
1325 struct rtable *rth;
1326 struct in_device *in_dev = __in_dev_get_rcu(dev);
1327 u32 itag = 0;
1328 int err;
1329
1330
1331
1332 if (in_dev == NULL)
1333 return -EINVAL;
1334
1335 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1336 skb->protocol != htons(ETH_P_IP))
1337 goto e_inval;
1338
1339 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1340 if (ipv4_is_loopback(saddr))
1341 goto e_inval;
1342
1343 if (ipv4_is_zeronet(saddr)) {
1344 if (!ipv4_is_local_multicast(daddr))
1345 goto e_inval;
1346 } else {
1347 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1348 in_dev, &itag);
1349 if (err < 0)
1350 goto e_err;
1351 }
1352 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1353 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1354 if (!rth)
1355 goto e_nobufs;
1356
1357#ifdef CONFIG_IP_ROUTE_CLASSID
1358 rth->dst.tclassid = itag;
1359#endif
1360 rth->dst.output = ip_rt_bug;
1361
1362 rth->rt_genid = rt_genid(dev_net(dev));
1363 rth->rt_flags = RTCF_MULTICAST;
1364 rth->rt_type = RTN_MULTICAST;
1365 rth->rt_is_input= 1;
1366 rth->rt_iif = 0;
1367 rth->rt_pmtu = 0;
1368 rth->rt_gateway = 0;
1369 rth->rt_uses_gateway = 0;
1370 INIT_LIST_HEAD(&rth->rt_uncached);
1371 if (our) {
1372 rth->dst.input= ip_local_deliver;
1373 rth->rt_flags |= RTCF_LOCAL;
1374 }
1375
1376#ifdef CONFIG_IP_MROUTE
1377 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1378 rth->dst.input = ip_mr_input;
1379#endif
1380 RT_CACHE_STAT_INC(in_slow_mc);
1381
1382 skb_dst_set(skb, &rth->dst);
1383 return 0;
1384
1385e_nobufs:
1386 return -ENOBUFS;
1387e_inval:
1388 return -EINVAL;
1389e_err:
1390 return err;
1391}
1392
1393
1394static void ip_handle_martian_source(struct net_device *dev,
1395 struct in_device *in_dev,
1396 struct sk_buff *skb,
1397 __be32 daddr,
1398 __be32 saddr)
1399{
1400 RT_CACHE_STAT_INC(in_martian_src);
1401#ifdef CONFIG_IP_ROUTE_VERBOSE
1402 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1403
1404
1405
1406
1407 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1408 &daddr, &saddr, dev->name);
1409 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1410 print_hex_dump(KERN_WARNING, "ll header: ",
1411 DUMP_PREFIX_OFFSET, 16, 1,
1412 skb_mac_header(skb),
1413 dev->hard_header_len, true);
1414 }
1415 }
1416#endif
1417}
1418
1419
1420static int __mkroute_input(struct sk_buff *skb,
1421 const struct fib_result *res,
1422 struct in_device *in_dev,
1423 __be32 daddr, __be32 saddr, u32 tos)
1424{
1425 struct rtable *rth;
1426 int err;
1427 struct in_device *out_dev;
1428 unsigned int flags = 0;
1429 bool do_cache;
1430 u32 itag;
1431
1432
1433 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1434 if (out_dev == NULL) {
1435 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1436 return -EINVAL;
1437 }
1438
1439 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1440 in_dev->dev, in_dev, &itag);
1441 if (err < 0) {
1442 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1443 saddr);
1444
1445 goto cleanup;
1446 }
1447
1448 do_cache = res->fi && !itag;
1449 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1450 (IN_DEV_SHARED_MEDIA(out_dev) ||
1451 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1452 flags |= RTCF_DOREDIRECT;
1453 do_cache = false;
1454 }
1455
1456 if (skb->protocol != htons(ETH_P_IP)) {
1457
1458
1459
1460
1461
1462
1463
1464 if (out_dev == in_dev &&
1465 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1466 err = -EINVAL;
1467 goto cleanup;
1468 }
1469 }
1470
1471 if (do_cache) {
1472 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1473 if (rt_cache_valid(rth)) {
1474 skb_dst_set_noref(skb, &rth->dst);
1475 goto out;
1476 }
1477 }
1478
1479 rth = rt_dst_alloc(out_dev->dev,
1480 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1481 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1482 if (!rth) {
1483 err = -ENOBUFS;
1484 goto cleanup;
1485 }
1486
1487 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1488 rth->rt_flags = flags;
1489 rth->rt_type = res->type;
1490 rth->rt_is_input = 1;
1491 rth->rt_iif = 0;
1492 rth->rt_pmtu = 0;
1493 rth->rt_gateway = 0;
1494 rth->rt_uses_gateway = 0;
1495 INIT_LIST_HEAD(&rth->rt_uncached);
1496
1497 rth->dst.input = ip_forward;
1498 rth->dst.output = ip_output;
1499
1500 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1501 skb_dst_set(skb, &rth->dst);
1502out:
1503 err = 0;
1504 cleanup:
1505 return err;
1506}
1507
1508static int ip_mkroute_input(struct sk_buff *skb,
1509 struct fib_result *res,
1510 const struct flowi4 *fl4,
1511 struct in_device *in_dev,
1512 __be32 daddr, __be32 saddr, u32 tos)
1513{
1514#ifdef CONFIG_IP_ROUTE_MULTIPATH
1515 if (res->fi && res->fi->fib_nhs > 1)
1516 fib_select_multipath(res);
1517#endif
1518
1519
1520 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1521}
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1535 u8 tos, struct net_device *dev)
1536{
1537 struct fib_result res;
1538 struct in_device *in_dev = __in_dev_get_rcu(dev);
1539 struct flowi4 fl4;
1540 unsigned int flags = 0;
1541 u32 itag = 0;
1542 struct rtable *rth;
1543 int err = -EINVAL;
1544 struct net *net = dev_net(dev);
1545 bool do_cache;
1546
1547
1548
1549 if (!in_dev)
1550 goto out;
1551
1552
1553
1554
1555
1556 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1557 goto martian_source;
1558
1559 res.fi = NULL;
1560 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1561 goto brd_input;
1562
1563
1564
1565
1566 if (ipv4_is_zeronet(saddr))
1567 goto martian_source;
1568
1569 if (ipv4_is_zeronet(daddr))
1570 goto martian_destination;
1571
1572 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1573 if (ipv4_is_loopback(daddr))
1574 goto martian_destination;
1575
1576 if (ipv4_is_loopback(saddr))
1577 goto martian_source;
1578 }
1579
1580
1581
1582
1583 fl4.flowi4_oif = 0;
1584 fl4.flowi4_iif = dev->ifindex;
1585 fl4.flowi4_mark = skb->mark;
1586 fl4.flowi4_tos = tos;
1587 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1588 fl4.daddr = daddr;
1589 fl4.saddr = saddr;
1590 err = fib_lookup(net, &fl4, &res);
1591 if (err != 0)
1592 goto no_route;
1593
1594 RT_CACHE_STAT_INC(in_slow_tot);
1595
1596 if (res.type == RTN_BROADCAST)
1597 goto brd_input;
1598
1599 if (res.type == RTN_LOCAL) {
1600 err = fib_validate_source(skb, saddr, daddr, tos,
1601 net->loopback_dev->ifindex,
1602 dev, in_dev, &itag);
1603 if (err < 0)
1604 goto martian_source_keep_err;
1605 goto local_input;
1606 }
1607
1608 if (!IN_DEV_FORWARD(in_dev))
1609 goto no_route;
1610 if (res.type != RTN_UNICAST)
1611 goto martian_destination;
1612
1613 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1614out: return err;
1615
1616brd_input:
1617 if (skb->protocol != htons(ETH_P_IP))
1618 goto e_inval;
1619
1620 if (!ipv4_is_zeronet(saddr)) {
1621 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1622 in_dev, &itag);
1623 if (err < 0)
1624 goto martian_source_keep_err;
1625 }
1626 flags |= RTCF_BROADCAST;
1627 res.type = RTN_BROADCAST;
1628 RT_CACHE_STAT_INC(in_brd);
1629
1630local_input:
1631 do_cache = false;
1632 if (res.fi) {
1633 if (!itag) {
1634 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1635 if (rt_cache_valid(rth)) {
1636 skb_dst_set_noref(skb, &rth->dst);
1637 err = 0;
1638 goto out;
1639 }
1640 do_cache = true;
1641 }
1642 }
1643
1644 rth = rt_dst_alloc(net->loopback_dev,
1645 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1646 if (!rth)
1647 goto e_nobufs;
1648
1649 rth->dst.input= ip_local_deliver;
1650 rth->dst.output= ip_rt_bug;
1651#ifdef CONFIG_IP_ROUTE_CLASSID
1652 rth->dst.tclassid = itag;
1653#endif
1654
1655 rth->rt_genid = rt_genid(net);
1656 rth->rt_flags = flags|RTCF_LOCAL;
1657 rth->rt_type = res.type;
1658 rth->rt_is_input = 1;
1659 rth->rt_iif = 0;
1660 rth->rt_pmtu = 0;
1661 rth->rt_gateway = 0;
1662 rth->rt_uses_gateway = 0;
1663 INIT_LIST_HEAD(&rth->rt_uncached);
1664 if (res.type == RTN_UNREACHABLE) {
1665 rth->dst.input= ip_error;
1666 rth->dst.error= -err;
1667 rth->rt_flags &= ~RTCF_LOCAL;
1668 }
1669 if (do_cache)
1670 rt_cache_route(&FIB_RES_NH(res), rth);
1671 skb_dst_set(skb, &rth->dst);
1672 err = 0;
1673 goto out;
1674
1675no_route:
1676 RT_CACHE_STAT_INC(in_no_route);
1677 res.type = RTN_UNREACHABLE;
1678 if (err == -ESRCH)
1679 err = -ENETUNREACH;
1680 goto local_input;
1681
1682
1683
1684
1685martian_destination:
1686 RT_CACHE_STAT_INC(in_martian_dst);
1687#ifdef CONFIG_IP_ROUTE_VERBOSE
1688 if (IN_DEV_LOG_MARTIANS(in_dev))
1689 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1690 &daddr, &saddr, dev->name);
1691#endif
1692
1693e_inval:
1694 err = -EINVAL;
1695 goto out;
1696
1697e_nobufs:
1698 err = -ENOBUFS;
1699 goto out;
1700
1701martian_source:
1702 err = -EINVAL;
1703martian_source_keep_err:
1704 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1705 goto out;
1706}
1707
1708int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1709 u8 tos, struct net_device *dev)
1710{
1711 int res;
1712
1713 rcu_read_lock();
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726 if (ipv4_is_multicast(daddr)) {
1727 struct in_device *in_dev = __in_dev_get_rcu(dev);
1728
1729 if (in_dev) {
1730 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1731 ip_hdr(skb)->protocol);
1732 if (our
1733#ifdef CONFIG_IP_MROUTE
1734 ||
1735 (!ipv4_is_local_multicast(daddr) &&
1736 IN_DEV_MFORWARD(in_dev))
1737#endif
1738 ) {
1739 int res = ip_route_input_mc(skb, daddr, saddr,
1740 tos, dev, our);
1741 rcu_read_unlock();
1742 return res;
1743 }
1744 }
1745 rcu_read_unlock();
1746 return -EINVAL;
1747 }
1748 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1749 rcu_read_unlock();
1750 return res;
1751}
1752EXPORT_SYMBOL(ip_route_input_noref);
1753
1754
1755static struct rtable *__mkroute_output(const struct fib_result *res,
1756 const struct flowi4 *fl4, int orig_oif,
1757 struct net_device *dev_out,
1758 unsigned int flags)
1759{
1760 struct fib_info *fi = res->fi;
1761 struct fib_nh_exception *fnhe;
1762 struct in_device *in_dev;
1763 u16 type = res->type;
1764 struct rtable *rth;
1765 bool do_cache;
1766
1767 in_dev = __in_dev_get_rcu(dev_out);
1768 if (!in_dev)
1769 return ERR_PTR(-EINVAL);
1770
1771 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1772 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1773 return ERR_PTR(-EINVAL);
1774
1775 if (ipv4_is_lbcast(fl4->daddr))
1776 type = RTN_BROADCAST;
1777 else if (ipv4_is_multicast(fl4->daddr))
1778 type = RTN_MULTICAST;
1779 else if (ipv4_is_zeronet(fl4->daddr))
1780 return ERR_PTR(-EINVAL);
1781
1782 if (dev_out->flags & IFF_LOOPBACK)
1783 flags |= RTCF_LOCAL;
1784
1785 if (type == RTN_BROADCAST) {
1786 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1787 fi = NULL;
1788 } else if (type == RTN_MULTICAST) {
1789 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1790 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1791 fl4->flowi4_proto))
1792 flags &= ~RTCF_LOCAL;
1793
1794
1795
1796
1797 if (fi && res->prefixlen < 4)
1798 fi = NULL;
1799 }
1800
1801 fnhe = NULL;
1802 do_cache = fi != NULL;
1803 if (fi) {
1804 struct rtable __rcu **prth;
1805 struct fib_nh *nh = &FIB_RES_NH(*res);
1806
1807 fnhe = find_exception(nh, fl4->daddr);
1808 if (fnhe)
1809 prth = &fnhe->fnhe_rth;
1810 else {
1811 if (unlikely(fl4->flowi4_flags &
1812 FLOWI_FLAG_KNOWN_NH &&
1813 !(nh->nh_gw &&
1814 nh->nh_scope == RT_SCOPE_LINK))) {
1815 do_cache = false;
1816 goto add;
1817 }
1818 prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1819 }
1820 rth = rcu_dereference(*prth);
1821 if (rt_cache_valid(rth)) {
1822 dst_hold(&rth->dst);
1823 return rth;
1824 }
1825 }
1826
1827add:
1828 rth = rt_dst_alloc(dev_out,
1829 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1830 IN_DEV_CONF_GET(in_dev, NOXFRM),
1831 do_cache);
1832 if (!rth)
1833 return ERR_PTR(-ENOBUFS);
1834
1835 rth->dst.output = ip_output;
1836
1837 rth->rt_genid = rt_genid(dev_net(dev_out));
1838 rth->rt_flags = flags;
1839 rth->rt_type = type;
1840 rth->rt_is_input = 0;
1841 rth->rt_iif = orig_oif ? : 0;
1842 rth->rt_pmtu = 0;
1843 rth->rt_gateway = 0;
1844 rth->rt_uses_gateway = 0;
1845 INIT_LIST_HEAD(&rth->rt_uncached);
1846
1847 RT_CACHE_STAT_INC(out_slow_tot);
1848
1849 if (flags & RTCF_LOCAL)
1850 rth->dst.input = ip_local_deliver;
1851 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1852 if (flags & RTCF_LOCAL &&
1853 !(dev_out->flags & IFF_LOOPBACK)) {
1854 rth->dst.output = ip_mc_output;
1855 RT_CACHE_STAT_INC(out_slow_mc);
1856 }
1857#ifdef CONFIG_IP_MROUTE
1858 if (type == RTN_MULTICAST) {
1859 if (IN_DEV_MFORWARD(in_dev) &&
1860 !ipv4_is_local_multicast(fl4->daddr)) {
1861 rth->dst.input = ip_mr_input;
1862 rth->dst.output = ip_mc_output;
1863 }
1864 }
1865#endif
1866 }
1867
1868 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1869
1870 return rth;
1871}
1872
1873
1874
1875
1876
1877struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1878{
1879 struct net_device *dev_out = NULL;
1880 __u8 tos = RT_FL_TOS(fl4);
1881 unsigned int flags = 0;
1882 struct fib_result res;
1883 struct rtable *rth;
1884 int orig_oif;
1885
1886 res.tclassid = 0;
1887 res.fi = NULL;
1888 res.table = NULL;
1889
1890 orig_oif = fl4->flowi4_oif;
1891
1892 fl4->flowi4_iif = net->loopback_dev->ifindex;
1893 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1894 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1895 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1896
1897 rcu_read_lock();
1898 if (fl4->saddr) {
1899 rth = ERR_PTR(-EINVAL);
1900 if (ipv4_is_multicast(fl4->saddr) ||
1901 ipv4_is_lbcast(fl4->saddr) ||
1902 ipv4_is_zeronet(fl4->saddr))
1903 goto out;
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913 if (fl4->flowi4_oif == 0 &&
1914 (ipv4_is_multicast(fl4->daddr) ||
1915 ipv4_is_lbcast(fl4->daddr))) {
1916
1917 dev_out = __ip_dev_find(net, fl4->saddr, false);
1918 if (dev_out == NULL)
1919 goto out;
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936 fl4->flowi4_oif = dev_out->ifindex;
1937 goto make_route;
1938 }
1939
1940 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1941
1942 if (!__ip_dev_find(net, fl4->saddr, false))
1943 goto out;
1944 }
1945 }
1946
1947
1948 if (fl4->flowi4_oif) {
1949 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1950 rth = ERR_PTR(-ENODEV);
1951 if (dev_out == NULL)
1952 goto out;
1953
1954
1955 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1956 rth = ERR_PTR(-ENETUNREACH);
1957 goto out;
1958 }
1959 if (ipv4_is_local_multicast(fl4->daddr) ||
1960 ipv4_is_lbcast(fl4->daddr)) {
1961 if (!fl4->saddr)
1962 fl4->saddr = inet_select_addr(dev_out, 0,
1963 RT_SCOPE_LINK);
1964 goto make_route;
1965 }
1966 if (fl4->saddr) {
1967 if (ipv4_is_multicast(fl4->daddr))
1968 fl4->saddr = inet_select_addr(dev_out, 0,
1969 fl4->flowi4_scope);
1970 else if (!fl4->daddr)
1971 fl4->saddr = inet_select_addr(dev_out, 0,
1972 RT_SCOPE_HOST);
1973 }
1974 }
1975
1976 if (!fl4->daddr) {
1977 fl4->daddr = fl4->saddr;
1978 if (!fl4->daddr)
1979 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1980 dev_out = net->loopback_dev;
1981 fl4->flowi4_oif = net->loopback_dev->ifindex;
1982 res.type = RTN_LOCAL;
1983 flags |= RTCF_LOCAL;
1984 goto make_route;
1985 }
1986
1987 if (fib_lookup(net, fl4, &res)) {
1988 res.fi = NULL;
1989 res.table = NULL;
1990 if (fl4->flowi4_oif) {
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009 if (fl4->saddr == 0)
2010 fl4->saddr = inet_select_addr(dev_out, 0,
2011 RT_SCOPE_LINK);
2012 res.type = RTN_UNICAST;
2013 goto make_route;
2014 }
2015 rth = ERR_PTR(-ENETUNREACH);
2016 goto out;
2017 }
2018
2019 if (res.type == RTN_LOCAL) {
2020 if (!fl4->saddr) {
2021 if (res.fi->fib_prefsrc)
2022 fl4->saddr = res.fi->fib_prefsrc;
2023 else
2024 fl4->saddr = fl4->daddr;
2025 }
2026 dev_out = net->loopback_dev;
2027 fl4->flowi4_oif = dev_out->ifindex;
2028 flags |= RTCF_LOCAL;
2029 goto make_route;
2030 }
2031
2032#ifdef CONFIG_IP_ROUTE_MULTIPATH
2033 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2034 fib_select_multipath(&res);
2035 else
2036#endif
2037 if (!res.prefixlen &&
2038 res.table->tb_num_default > 1 &&
2039 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2040 fib_select_default(&res);
2041
2042 if (!fl4->saddr)
2043 fl4->saddr = FIB_RES_PREFSRC(net, res);
2044
2045 dev_out = FIB_RES_DEV(res);
2046 fl4->flowi4_oif = dev_out->ifindex;
2047
2048
2049make_route:
2050 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2051
2052out:
2053 rcu_read_unlock();
2054 return rth;
2055}
2056EXPORT_SYMBOL_GPL(__ip_route_output_key);
2057
2058static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2059{
2060 return NULL;
2061}
2062
2063static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2064{
2065 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2066
2067 return mtu ? : dst->dev->mtu;
2068}
2069
2070static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2071 struct sk_buff *skb, u32 mtu)
2072{
2073}
2074
/* dst_ops->redirect for blackhole routes: intentionally does nothing. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2079
2080static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2081 unsigned long old)
2082{
2083 return NULL;
2084}
2085
2086static struct dst_ops ipv4_dst_blackhole_ops = {
2087 .family = AF_INET,
2088 .protocol = cpu_to_be16(ETH_P_IP),
2089 .check = ipv4_blackhole_dst_check,
2090 .mtu = ipv4_blackhole_mtu,
2091 .default_advmss = ipv4_default_advmss,
2092 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2093 .redirect = ipv4_rt_blackhole_redirect,
2094 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2095 .neigh_lookup = ipv4_neigh_lookup,
2096};
2097
2098struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2099{
2100 struct rtable *ort = (struct rtable *) dst_orig;
2101 struct rtable *rt;
2102
2103 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2104 if (rt) {
2105 struct dst_entry *new = &rt->dst;
2106
2107 new->__use = 1;
2108 new->input = dst_discard;
2109 new->output = dst_discard;
2110
2111 new->dev = ort->dst.dev;
2112 if (new->dev)
2113 dev_hold(new->dev);
2114
2115 rt->rt_is_input = ort->rt_is_input;
2116 rt->rt_iif = ort->rt_iif;
2117 rt->rt_pmtu = ort->rt_pmtu;
2118
2119 rt->rt_genid = rt_genid(net);
2120 rt->rt_flags = ort->rt_flags;
2121 rt->rt_type = ort->rt_type;
2122 rt->rt_gateway = ort->rt_gateway;
2123 rt->rt_uses_gateway = ort->rt_uses_gateway;
2124
2125 INIT_LIST_HEAD(&rt->rt_uncached);
2126
2127 dst_free(new);
2128 }
2129
2130 dst_release(dst_orig);
2131
2132 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2133}
2134
2135struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2136 struct sock *sk)
2137{
2138 struct rtable *rt = __ip_route_output_key(net, flp4);
2139
2140 if (IS_ERR(rt))
2141 return rt;
2142
2143 if (flp4->flowi4_proto)
2144 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2145 flowi4_to_flowi(flp4),
2146 sk, 0);
2147
2148 return rt;
2149}
2150EXPORT_SYMBOL_GPL(ip_route_output_flow);
2151
2152static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2153 struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2154 u32 seq, int event, int nowait, unsigned int flags)
2155{
2156 struct rtable *rt = skb_rtable(skb);
2157 struct rtmsg *r;
2158 struct nlmsghdr *nlh;
2159 unsigned long expires = 0;
2160 u32 error;
2161 u32 metrics[RTAX_MAX];
2162
2163 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2164 if (nlh == NULL)
2165 return -EMSGSIZE;
2166
2167 r = nlmsg_data(nlh);
2168 r->rtm_family = AF_INET;
2169 r->rtm_dst_len = 32;
2170 r->rtm_src_len = 0;
2171 r->rtm_tos = fl4->flowi4_tos;
2172 r->rtm_table = RT_TABLE_MAIN;
2173 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2174 goto nla_put_failure;
2175 r->rtm_type = rt->rt_type;
2176 r->rtm_scope = RT_SCOPE_UNIVERSE;
2177 r->rtm_protocol = RTPROT_UNSPEC;
2178 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2179 if (rt->rt_flags & RTCF_NOTIFY)
2180 r->rtm_flags |= RTM_F_NOTIFY;
2181
2182 if (nla_put_be32(skb, RTA_DST, dst))
2183 goto nla_put_failure;
2184 if (src) {
2185 r->rtm_src_len = 32;
2186 if (nla_put_be32(skb, RTA_SRC, src))
2187 goto nla_put_failure;
2188 }
2189 if (rt->dst.dev &&
2190 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2191 goto nla_put_failure;
2192#ifdef CONFIG_IP_ROUTE_CLASSID
2193 if (rt->dst.tclassid &&
2194 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2195 goto nla_put_failure;
2196#endif
2197 if (!rt_is_input_route(rt) &&
2198 fl4->saddr != src) {
2199 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2200 goto nla_put_failure;
2201 }
2202 if (rt->rt_uses_gateway &&
2203 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2204 goto nla_put_failure;
2205
2206 expires = rt->dst.expires;
2207 if (expires) {
2208 unsigned long now = jiffies;
2209
2210 if (time_before(now, expires))
2211 expires -= now;
2212 else
2213 expires = 0;
2214 }
2215
2216 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2217 if (rt->rt_pmtu && expires)
2218 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2219 if (rtnetlink_put_metrics(skb, metrics) < 0)
2220 goto nla_put_failure;
2221
2222 if (fl4->flowi4_mark &&
2223 nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2224 goto nla_put_failure;
2225
2226 error = rt->dst.error;
2227
2228 if (rt_is_input_route(rt)) {
2229 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2230 goto nla_put_failure;
2231 }
2232
2233 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2234 goto nla_put_failure;
2235
2236 return nlmsg_end(skb, nlh);
2237
2238nla_put_failure:
2239 nlmsg_cancel(skb, nlh);
2240 return -EMSGSIZE;
2241}
2242
2243static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2244{
2245 struct net *net = sock_net(in_skb->sk);
2246 struct rtmsg *rtm;
2247 struct nlattr *tb[RTA_MAX+1];
2248 struct rtable *rt = NULL;
2249 struct flowi4 fl4;
2250 __be32 dst = 0;
2251 __be32 src = 0;
2252 u32 iif;
2253 int err;
2254 int mark;
2255 struct sk_buff *skb;
2256
2257 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2258 if (err < 0)
2259 goto errout;
2260
2261 rtm = nlmsg_data(nlh);
2262
2263 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2264 if (skb == NULL) {
2265 err = -ENOBUFS;
2266 goto errout;
2267 }
2268
2269
2270
2271
2272 skb_reset_mac_header(skb);
2273 skb_reset_network_header(skb);
2274
2275
2276 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2277 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2278
2279 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2280 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2281 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2282 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2283
2284 memset(&fl4, 0, sizeof(fl4));
2285 fl4.daddr = dst;
2286 fl4.saddr = src;
2287 fl4.flowi4_tos = rtm->rtm_tos;
2288 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2289 fl4.flowi4_mark = mark;
2290
2291 if (iif) {
2292 struct net_device *dev;
2293
2294 dev = __dev_get_by_index(net, iif);
2295 if (dev == NULL) {
2296 err = -ENODEV;
2297 goto errout_free;
2298 }
2299
2300 skb->protocol = htons(ETH_P_IP);
2301 skb->dev = dev;
2302 skb->mark = mark;
2303 local_bh_disable();
2304 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2305 local_bh_enable();
2306
2307 rt = skb_rtable(skb);
2308 if (err == 0 && rt->dst.error)
2309 err = -rt->dst.error;
2310 } else {
2311 rt = ip_route_output_key(net, &fl4);
2312
2313 err = 0;
2314 if (IS_ERR(rt))
2315 err = PTR_ERR(rt);
2316 }
2317
2318 if (err)
2319 goto errout_free;
2320
2321 skb_dst_set(skb, &rt->dst);
2322 if (rtm->rtm_flags & RTM_F_NOTIFY)
2323 rt->rt_flags |= RTCF_NOTIFY;
2324
2325 err = rt_fill_info(net, dst, src, &fl4, skb,
2326 NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2327 RTM_NEWROUTE, 0, 0);
2328 if (err <= 0)
2329 goto errout_free;
2330
2331 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2332errout:
2333 return err;
2334
2335errout_free:
2336 kfree_skb(skb);
2337 goto errout;
2338}
2339
2340int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2341{
2342 return skb->len;
2343}
2344
2345void ip_rt_multicast_event(struct in_device *in_dev)
2346{
2347 rt_cache_flush(dev_net(in_dev->dev));
2348}
2349
2350#ifdef CONFIG_SYSCTL
2351static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2352 void __user *buffer,
2353 size_t *lenp, loff_t *ppos)
2354{
2355 if (write) {
2356 rt_cache_flush((struct net *)__ctl->extra1);
2357 return 0;
2358 }
2359
2360 return -EINVAL;
2361}
2362
2363static ctl_table ipv4_route_table[] = {
2364 {
2365 .procname = "gc_thresh",
2366 .data = &ipv4_dst_ops.gc_thresh,
2367 .maxlen = sizeof(int),
2368 .mode = 0644,
2369 .proc_handler = proc_dointvec,
2370 },
2371 {
2372 .procname = "max_size",
2373 .data = &ip_rt_max_size,
2374 .maxlen = sizeof(int),
2375 .mode = 0644,
2376 .proc_handler = proc_dointvec,
2377 },
2378 {
2379
2380
2381 .procname = "gc_min_interval",
2382 .data = &ip_rt_gc_min_interval,
2383 .maxlen = sizeof(int),
2384 .mode = 0644,
2385 .proc_handler = proc_dointvec_jiffies,
2386 },
2387 {
2388 .procname = "gc_min_interval_ms",
2389 .data = &ip_rt_gc_min_interval,
2390 .maxlen = sizeof(int),
2391 .mode = 0644,
2392 .proc_handler = proc_dointvec_ms_jiffies,
2393 },
2394 {
2395 .procname = "gc_timeout",
2396 .data = &ip_rt_gc_timeout,
2397 .maxlen = sizeof(int),
2398 .mode = 0644,
2399 .proc_handler = proc_dointvec_jiffies,
2400 },
2401 {
2402 .procname = "gc_interval",
2403 .data = &ip_rt_gc_interval,
2404 .maxlen = sizeof(int),
2405 .mode = 0644,
2406 .proc_handler = proc_dointvec_jiffies,
2407 },
2408 {
2409 .procname = "redirect_load",
2410 .data = &ip_rt_redirect_load,
2411 .maxlen = sizeof(int),
2412 .mode = 0644,
2413 .proc_handler = proc_dointvec,
2414 },
2415 {
2416 .procname = "redirect_number",
2417 .data = &ip_rt_redirect_number,
2418 .maxlen = sizeof(int),
2419 .mode = 0644,
2420 .proc_handler = proc_dointvec,
2421 },
2422 {
2423 .procname = "redirect_silence",
2424 .data = &ip_rt_redirect_silence,
2425 .maxlen = sizeof(int),
2426 .mode = 0644,
2427 .proc_handler = proc_dointvec,
2428 },
2429 {
2430 .procname = "error_cost",
2431 .data = &ip_rt_error_cost,
2432 .maxlen = sizeof(int),
2433 .mode = 0644,
2434 .proc_handler = proc_dointvec,
2435 },
2436 {
2437 .procname = "error_burst",
2438 .data = &ip_rt_error_burst,
2439 .maxlen = sizeof(int),
2440 .mode = 0644,
2441 .proc_handler = proc_dointvec,
2442 },
2443 {
2444 .procname = "gc_elasticity",
2445 .data = &ip_rt_gc_elasticity,
2446 .maxlen = sizeof(int),
2447 .mode = 0644,
2448 .proc_handler = proc_dointvec,
2449 },
2450 {
2451 .procname = "mtu_expires",
2452 .data = &ip_rt_mtu_expires,
2453 .maxlen = sizeof(int),
2454 .mode = 0644,
2455 .proc_handler = proc_dointvec_jiffies,
2456 },
2457 {
2458 .procname = "min_pmtu",
2459 .data = &ip_rt_min_pmtu,
2460 .maxlen = sizeof(int),
2461 .mode = 0644,
2462 .proc_handler = proc_dointvec,
2463 },
2464 {
2465 .procname = "min_adv_mss",
2466 .data = &ip_rt_min_advmss,
2467 .maxlen = sizeof(int),
2468 .mode = 0644,
2469 .proc_handler = proc_dointvec,
2470 },
2471 { }
2472};
2473
2474static struct ctl_table ipv4_route_flush_table[] = {
2475 {
2476 .procname = "flush",
2477 .maxlen = sizeof(int),
2478 .mode = 0200,
2479 .proc_handler = ipv4_sysctl_rtcache_flush,
2480 },
2481 { },
2482};
2483
2484static __net_init int sysctl_route_net_init(struct net *net)
2485{
2486 struct ctl_table *tbl;
2487
2488 tbl = ipv4_route_flush_table;
2489 if (!net_eq(net, &init_net)) {
2490 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2491 if (tbl == NULL)
2492 goto err_dup;
2493 }
2494 tbl[0].extra1 = net;
2495
2496 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2497 if (net->ipv4.route_hdr == NULL)
2498 goto err_reg;
2499 return 0;
2500
2501err_reg:
2502 if (tbl != ipv4_route_flush_table)
2503 kfree(tbl);
2504err_dup:
2505 return -ENOMEM;
2506}
2507
2508static __net_exit void sysctl_route_net_exit(struct net *net)
2509{
2510 struct ctl_table *tbl;
2511
2512 tbl = net->ipv4.route_hdr->ctl_table_arg;
2513 unregister_net_sysctl_table(net->ipv4.route_hdr);
2514 BUG_ON(tbl == ipv4_route_flush_table);
2515 kfree(tbl);
2516}
2517
2518static __net_initdata struct pernet_operations sysctl_route_ops = {
2519 .init = sysctl_route_net_init,
2520 .exit = sysctl_route_net_exit,
2521};
2522#endif
2523
2524static __net_init int rt_genid_init(struct net *net)
2525{
2526 atomic_set(&net->rt_genid, 0);
2527 get_random_bytes(&net->ipv4.dev_addr_genid,
2528 sizeof(net->ipv4.dev_addr_genid));
2529 return 0;
2530}
2531
2532static __net_initdata struct pernet_operations rt_genid_ops = {
2533 .init = rt_genid_init,
2534};
2535
2536static int __net_init ipv4_inetpeer_init(struct net *net)
2537{
2538 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2539
2540 if (!bp)
2541 return -ENOMEM;
2542 inet_peer_base_init(bp);
2543 net->ipv4.peers = bp;
2544 return 0;
2545}
2546
2547static void __net_exit ipv4_inetpeer_exit(struct net *net)
2548{
2549 struct inet_peer_base *bp = net->ipv4.peers;
2550
2551 net->ipv4.peers = NULL;
2552 inetpeer_invalidate_tree(bp);
2553 kfree(bp);
2554}
2555
2556static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2557 .init = ipv4_inetpeer_init,
2558 .exit = ipv4_inetpeer_exit,
2559};
2560
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting array; allocated with 256 entries in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2564
2565int __init ip_rt_init(void)
2566{
2567 int rc = 0;
2568
2569#ifdef CONFIG_IP_ROUTE_CLASSID
2570 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2571 if (!ip_rt_acct)
2572 panic("IP: failed to allocate ip_rt_acct\n");
2573#endif
2574
2575 ipv4_dst_ops.kmem_cachep =
2576 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2577 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2578
2579 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2580
2581 if (dst_entries_init(&ipv4_dst_ops) < 0)
2582 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2583
2584 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2585 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2586
2587 ipv4_dst_ops.gc_thresh = ~0;
2588 ip_rt_max_size = INT_MAX;
2589
2590 devinet_init();
2591 ip_fib_init();
2592
2593 if (ip_rt_proc_init())
2594 pr_err("Unable to create route proc files\n");
2595#ifdef CONFIG_XFRM
2596 xfrm_init();
2597 xfrm4_init(ip_rt_max_size);
2598#endif
2599 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2600
2601#ifdef CONFIG_SYSCTL
2602 register_pernet_subsys(&sysctl_route_ops);
2603#endif
2604 register_pernet_subsys(&rt_genid_ops);
2605 register_pernet_subsys(&ipv4_inetpeer_ops);
2606 return rc;
2607}
2608
2609#ifdef CONFIG_SYSCTL
2610
2611
2612
2613
2614void __init ip_static_sysctl_init(void)
2615{
2616 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2617}
2618#endif
2619