1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65#define pr_fmt(fmt) "IPv4: " fmt
66
67#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
89#include <linux/rcupdate.h>
90#include <linux/times.h>
91#include <linux/slab.h>
92#include <net/dst.h>
93#include <net/net_namespace.h>
94#include <net/protocol.h>
95#include <net/ip.h>
96#include <net/route.h>
97#include <net/inetpeer.h>
98#include <net/sock.h>
99#include <net/ip_fib.h>
100#include <net/arp.h>
101#include <net/tcp.h>
102#include <net/icmp.h>
103#include <net/xfrm.h>
104#include <net/netevent.h>
105#include <net/rtnetlink.h>
106#ifdef CONFIG_SYSCTL
107#include <linux/sysctl.h>
108#include <linux/kmemleak.h>
109#endif
110#include <net/secure_seq.h>
111
112#define RT_FL_TOS(oldflp4) \
113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
119static int ip_rt_max_size;
120static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123static int ip_rt_redirect_number __read_mostly = 9;
124static int ip_rt_redirect_load __read_mostly = HZ / 50;
125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126static int ip_rt_error_cost __read_mostly = HZ;
127static int ip_rt_error_burst __read_mostly = 5 * HZ;
128static int ip_rt_gc_elasticity __read_mostly = 8;
129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256;
132
133
134
135
136
137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139static unsigned int ipv4_mtu(const struct dst_entry *dst);
140static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141static void ipv4_link_failure(struct sk_buff *skb);
142static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu);
144static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
146static void ipv4_dst_destroy(struct dst_entry *dst);
147
/* dst_ops->ifdown callback: intentionally does nothing. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	/* nothing to do */
}
152
153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154{
155 WARN_ON(1);
156 return NULL;
157}
158
/* Defined below; declared here for the ipv4_dst_ops table. */
static struct neighbour *
ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb,
		  const void *daddr);
162
163static struct dst_ops ipv4_dst_ops = {
164 .family = AF_INET,
165 .protocol = cpu_to_be16(ETH_P_IP),
166 .check = ipv4_dst_check,
167 .default_advmss = ipv4_default_advmss,
168 .mtu = ipv4_mtu,
169 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy,
171 .ifdown = ipv4_dst_ifdown,
172 .negative_advice = ipv4_negative_advice,
173 .link_failure = ipv4_link_failure,
174 .update_pmtu = ip_rt_update_pmtu,
175 .redirect = ip_do_redirect,
176 .local_out = __ip_local_out,
177 .neigh_lookup = ipv4_neigh_lookup,
178};
179
180#define ECN_OR_COST(class) TC_PRIO_##class
181
182const __u8 ip_tos2prio[16] = {
183 TC_PRIO_BESTEFFORT,
184 ECN_OR_COST(BESTEFFORT),
185 TC_PRIO_BESTEFFORT,
186 ECN_OR_COST(BESTEFFORT),
187 TC_PRIO_BULK,
188 ECN_OR_COST(BULK),
189 TC_PRIO_BULK,
190 ECN_OR_COST(BULK),
191 TC_PRIO_INTERACTIVE,
192 ECN_OR_COST(INTERACTIVE),
193 TC_PRIO_INTERACTIVE,
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
199};
200EXPORT_SYMBOL(ip_tos2prio);
201
202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205#ifdef CONFIG_PROC_FS
206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207{
208 if (*pos)
209 return NULL;
210 return SEQ_START_TOKEN;
211}
212
213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214{
215 ++*pos;
216 return NULL;
217}
218
/* seq_file stop: nothing was acquired in start, so nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	/* nothing to do */
}
222
223static int rt_cache_seq_show(struct seq_file *seq, void *v)
224{
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
230 return 0;
231}
232
233static const struct seq_operations rt_cache_seq_ops = {
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
238};
239
240static int rt_cache_seq_open(struct inode *inode, struct file *file)
241{
242 return seq_open(file, &rt_cache_seq_ops);
243}
244
245static const struct file_operations rt_cache_seq_fops = {
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
248 .read = seq_read,
249 .llseek = seq_lseek,
250 .release = seq_release,
251};
252
253
254static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255{
256 int cpu;
257
258 if (*pos == 0)
259 return SEQ_START_TOKEN;
260
261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262 if (!cpu_possible(cpu))
263 continue;
264 *pos = cpu+1;
265 return &per_cpu(rt_cache_stat, cpu);
266 }
267 return NULL;
268}
269
270static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271{
272 int cpu;
273
274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275 if (!cpu_possible(cpu))
276 continue;
277 *pos = cpu+1;
278 return &per_cpu(rt_cache_stat, cpu);
279 }
280 return NULL;
281
282}
283
/* seq_file stop: the iterator holds no resources. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* nothing to do */
}
288
289static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290{
291 struct rt_cache_stat *st = v;
292
293 if (v == SEQ_START_TOKEN) {
294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295 return 0;
296 }
297
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 dst_entries_get_slow(&ipv4_dst_ops),
301 st->in_hit,
302 st->in_slow_tot,
303 st->in_slow_mc,
304 st->in_no_route,
305 st->in_brd,
306 st->in_martian_dst,
307 st->in_martian_src,
308
309 st->out_hit,
310 st->out_slow_tot,
311 st->out_slow_mc,
312
313 st->gc_total,
314 st->gc_ignored,
315 st->gc_goal_miss,
316 st->gc_dst_overflow,
317 st->in_hlist_search,
318 st->out_hlist_search
319 );
320 return 0;
321}
322
323static const struct seq_operations rt_cpu_seq_ops = {
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
328};
329
330
331static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332{
333 return seq_open(file, &rt_cpu_seq_ops);
334}
335
336static const struct file_operations rt_cpu_seq_fops = {
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
339 .read = seq_read,
340 .llseek = seq_lseek,
341 .release = seq_release,
342};
343
344#ifdef CONFIG_IP_ROUTE_CLASSID
345static int rt_acct_proc_show(struct seq_file *m, void *v)
346{
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
362 }
363
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367}
368
369static int rt_acct_proc_open(struct inode *inode, struct file *file)
370{
371 return single_open(file, rt_acct_proc_show, NULL);
372}
373
374static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
377 .read = seq_read,
378 .llseek = seq_lseek,
379 .release = single_release,
380};
381#endif
382
383static int __net_init ip_rt_do_proc_init(struct net *net)
384{
385 struct proc_dir_entry *pde;
386
387 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
388 &rt_cache_seq_fops);
389 if (!pde)
390 goto err1;
391
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
394 if (!pde)
395 goto err2;
396
397#ifdef CONFIG_IP_ROUTE_CLASSID
398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399 if (!pde)
400 goto err3;
401#endif
402 return 0;
403
404#ifdef CONFIG_IP_ROUTE_CLASSID
405err3:
406 remove_proc_entry("rt_cache", net->proc_net_stat);
407#endif
408err2:
409 remove_proc_entry("rt_cache", net->proc_net);
410err1:
411 return -ENOMEM;
412}
413
414static void __net_exit ip_rt_do_proc_exit(struct net *net)
415{
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
418#ifdef CONFIG_IP_ROUTE_CLASSID
419 remove_proc_entry("rt_acct", net->proc_net);
420#endif
421}
422
423static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
426};
427
428static int __init ip_rt_proc_init(void)
429{
430 return register_pernet_subsys(&ip_rt_proc_ops);
431}
432
433#else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
438#endif
439
440static inline bool rt_is_expired(const struct rtable *rth)
441{
442 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
443}
444
/*
 * Invalidate the routes of @net by bumping its generation id; routes
 * carrying the old id then fail rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}
449
450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
453{
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
456 const struct rtable *rt;
457 struct neighbour *n;
458
459 rt = (const struct rtable *) dst;
460 if (rt->rt_gateway)
461 pkey = (const __be32 *) &rt->rt_gateway;
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
464
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466 if (n)
467 return n;
468 return neigh_create(&arp_tbl, pkey, dev);
469}
470
471
472
473
474
475
476
477
478static void ip_select_fb_ident(struct iphdr *iph)
479{
480 static DEFINE_SPINLOCK(ip_fb_id_lock);
481 static u32 ip_fallback_id;
482 u32 salt;
483
484 spin_lock_bh(&ip_fb_id_lock);
485 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
486 iph->id = htons(salt & 0xFFFF);
487 ip_fallback_id = salt;
488 spin_unlock_bh(&ip_fb_id_lock);
489}
490
491void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
492{
493 struct net *net = dev_net(dst->dev);
494 struct inet_peer *peer;
495
496 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
497 if (peer) {
498 iph->id = htons(inet_getid(peer, more));
499 inet_putpeer(peer);
500 return;
501 }
502
503 ip_select_fb_ident(iph);
504}
505EXPORT_SYMBOL(__ip_select_ident);
506
507static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
508 const struct iphdr *iph,
509 int oif, u8 tos,
510 u8 prot, u32 mark, int flow_flags)
511{
512 if (sk) {
513 const struct inet_sock *inet = inet_sk(sk);
514
515 oif = sk->sk_bound_dev_if;
516 mark = sk->sk_mark;
517 tos = RT_CONN_FLAGS(sk);
518 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
519 }
520 flowi4_init_output(fl4, oif, mark, tos,
521 RT_SCOPE_UNIVERSE, prot,
522 flow_flags,
523 iph->daddr, iph->saddr, 0, 0);
524}
525
526static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527 const struct sock *sk)
528{
529 const struct iphdr *iph = ip_hdr(skb);
530 int oif = skb->dev->ifindex;
531 u8 tos = RT_TOS(iph->tos);
532 u8 prot = iph->protocol;
533 u32 mark = skb->mark;
534
535 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536}
537
538static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
539{
540 const struct inet_sock *inet = inet_sk(sk);
541 const struct ip_options_rcu *inet_opt;
542 __be32 daddr = inet->inet_daddr;
543
544 rcu_read_lock();
545 inet_opt = rcu_dereference(inet->inet_opt);
546 if (inet_opt && inet_opt->opt.srr)
547 daddr = inet_opt->opt.faddr;
548 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551 inet_sk_flowi_flags(sk),
552 daddr, inet->inet_saddr, 0, 0);
553 rcu_read_unlock();
554}
555
/* Build a flow key from @skb when one is given, otherwise from @sk alone. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
564
565static inline void rt_free(struct rtable *rt)
566{
567 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
568}
569
570static DEFINE_SPINLOCK(fnhe_lock);
571
572static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
573{
574 struct fib_nh_exception *fnhe, *oldest;
575 struct rtable *orig;
576
577 oldest = rcu_dereference(hash->chain);
578 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
579 fnhe = rcu_dereference(fnhe->fnhe_next)) {
580 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
581 oldest = fnhe;
582 }
583 orig = rcu_dereference(oldest->fnhe_rth);
584 if (orig) {
585 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
586 rt_free(orig);
587 }
588 return oldest;
589}
590
591static inline u32 fnhe_hashfun(__be32 daddr)
592{
593 u32 hval;
594
595 hval = (__force u32) daddr;
596 hval ^= (hval >> 11) ^ (hval >> 22);
597
598 return hval & (FNHE_HASH_SIZE - 1);
599}
600
601static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
602 u32 pmtu, unsigned long expires)
603{
604 struct fnhe_hash_bucket *hash;
605 struct fib_nh_exception *fnhe;
606 int depth;
607 u32 hval = fnhe_hashfun(daddr);
608
609 spin_lock_bh(&fnhe_lock);
610
611 hash = nh->nh_exceptions;
612 if (!hash) {
613 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
614 if (!hash)
615 goto out_unlock;
616 nh->nh_exceptions = hash;
617 }
618
619 hash += hval;
620
621 depth = 0;
622 for (fnhe = rcu_dereference(hash->chain); fnhe;
623 fnhe = rcu_dereference(fnhe->fnhe_next)) {
624 if (fnhe->fnhe_daddr == daddr)
625 break;
626 depth++;
627 }
628
629 if (fnhe) {
630 if (gw)
631 fnhe->fnhe_gw = gw;
632 if (pmtu) {
633 fnhe->fnhe_pmtu = pmtu;
634 fnhe->fnhe_expires = expires;
635 }
636 } else {
637 if (depth > FNHE_RECLAIM_DEPTH)
638 fnhe = fnhe_oldest(hash);
639 else {
640 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
641 if (!fnhe)
642 goto out_unlock;
643
644 fnhe->fnhe_next = hash->chain;
645 rcu_assign_pointer(hash->chain, fnhe);
646 }
647 fnhe->fnhe_daddr = daddr;
648 fnhe->fnhe_gw = gw;
649 fnhe->fnhe_pmtu = pmtu;
650 fnhe->fnhe_expires = expires;
651 }
652
653 fnhe->fnhe_stamp = jiffies;
654
655out_unlock:
656 spin_unlock_bh(&fnhe_lock);
657 return;
658}
659
660static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
661 bool kill_route)
662{
663 __be32 new_gw = icmp_hdr(skb)->un.gateway;
664 __be32 old_gw = ip_hdr(skb)->saddr;
665 struct net_device *dev = skb->dev;
666 struct in_device *in_dev;
667 struct fib_result res;
668 struct neighbour *n;
669 struct net *net;
670
671 switch (icmp_hdr(skb)->code & 7) {
672 case ICMP_REDIR_NET:
673 case ICMP_REDIR_NETTOS:
674 case ICMP_REDIR_HOST:
675 case ICMP_REDIR_HOSTTOS:
676 break;
677
678 default:
679 return;
680 }
681
682 if (rt->rt_gateway != old_gw)
683 return;
684
685 in_dev = __in_dev_get_rcu(dev);
686 if (!in_dev)
687 return;
688
689 net = dev_net(dev);
690 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
691 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
692 ipv4_is_zeronet(new_gw))
693 goto reject_redirect;
694
695 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
696 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
697 goto reject_redirect;
698 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
699 goto reject_redirect;
700 } else {
701 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
702 goto reject_redirect;
703 }
704
705 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
706 if (n) {
707 if (!(n->nud_state & NUD_VALID)) {
708 neigh_event_send(n, NULL);
709 } else {
710 if (fib_lookup(net, fl4, &res) == 0) {
711 struct fib_nh *nh = &FIB_RES_NH(res);
712
713 update_or_create_fnhe(nh, fl4->daddr, new_gw,
714 0, 0);
715 }
716 if (kill_route)
717 rt->dst.obsolete = DST_OBSOLETE_KILL;
718 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
719 }
720 neigh_release(n);
721 }
722 return;
723
724reject_redirect:
725#ifdef CONFIG_IP_ROUTE_VERBOSE
726 if (IN_DEV_LOG_MARTIANS(in_dev)) {
727 const struct iphdr *iph = (const struct iphdr *) skb->data;
728 __be32 daddr = iph->daddr;
729 __be32 saddr = iph->saddr;
730
731 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732 " Advised path = %pI4 -> %pI4\n",
733 &old_gw, dev->name, &new_gw,
734 &saddr, &daddr);
735 }
736#endif
737 ;
738}
739
740static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741{
742 struct rtable *rt;
743 struct flowi4 fl4;
744
745 rt = (struct rtable *) dst;
746
747 ip_rt_build_flow_key(&fl4, sk, skb);
748 __ip_do_redirect(rt, skb, &fl4, true);
749}
750
751static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752{
753 struct rtable *rt = (struct rtable *)dst;
754 struct dst_entry *ret = dst;
755
756 if (rt) {
757 if (dst->obsolete > 0) {
758 ip_rt_put(rt);
759 ret = NULL;
760 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761 rt->dst.expires) {
762 ip_rt_put(rt);
763 ret = NULL;
764 }
765 }
766 return ret;
767}
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785void ip_rt_send_redirect(struct sk_buff *skb)
786{
787 struct rtable *rt = skb_rtable(skb);
788 struct in_device *in_dev;
789 struct inet_peer *peer;
790 struct net *net;
791 int log_martians;
792
793 rcu_read_lock();
794 in_dev = __in_dev_get_rcu(rt->dst.dev);
795 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
796 rcu_read_unlock();
797 return;
798 }
799 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800 rcu_read_unlock();
801
802 net = dev_net(rt->dst.dev);
803 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
804 if (!peer) {
805 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
806 rt_nexthop(rt, ip_hdr(skb)->daddr));
807 return;
808 }
809
810
811
812
813 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
814 peer->rate_tokens = 0;
815
816
817
818
819 if (peer->rate_tokens >= ip_rt_redirect_number) {
820 peer->rate_last = jiffies;
821 goto out_put_peer;
822 }
823
824
825
826
827 if (peer->rate_tokens == 0 ||
828 time_after(jiffies,
829 (peer->rate_last +
830 (ip_rt_redirect_load << peer->rate_tokens)))) {
831 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
832
833 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
834 peer->rate_last = jiffies;
835 ++peer->rate_tokens;
836#ifdef CONFIG_IP_ROUTE_VERBOSE
837 if (log_martians &&
838 peer->rate_tokens == ip_rt_redirect_number)
839 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
840 &ip_hdr(skb)->saddr, inet_iif(skb),
841 &ip_hdr(skb)->daddr, &gw);
842#endif
843 }
844out_put_peer:
845 inet_putpeer(peer);
846}
847
848static int ip_error(struct sk_buff *skb)
849{
850 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
851 struct rtable *rt = skb_rtable(skb);
852 struct inet_peer *peer;
853 unsigned long now;
854 struct net *net;
855 bool send;
856 int code;
857
858 net = dev_net(rt->dst.dev);
859 if (!IN_DEV_FORWARD(in_dev)) {
860 switch (rt->dst.error) {
861 case EHOSTUNREACH:
862 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863 break;
864
865 case ENETUNREACH:
866 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
867 break;
868 }
869 goto out;
870 }
871
872 switch (rt->dst.error) {
873 case EINVAL:
874 default:
875 goto out;
876 case EHOSTUNREACH:
877 code = ICMP_HOST_UNREACH;
878 break;
879 case ENETUNREACH:
880 code = ICMP_NET_UNREACH;
881 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
882 break;
883 case EACCES:
884 code = ICMP_PKT_FILTERED;
885 break;
886 }
887
888 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
889
890 send = true;
891 if (peer) {
892 now = jiffies;
893 peer->rate_tokens += now - peer->rate_last;
894 if (peer->rate_tokens > ip_rt_error_burst)
895 peer->rate_tokens = ip_rt_error_burst;
896 peer->rate_last = now;
897 if (peer->rate_tokens >= ip_rt_error_cost)
898 peer->rate_tokens -= ip_rt_error_cost;
899 else
900 send = false;
901 inet_putpeer(peer);
902 }
903 if (send)
904 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
905
906out: kfree_skb(skb);
907 return 0;
908}
909
910static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
911{
912 struct dst_entry *dst = &rt->dst;
913 struct fib_result res;
914
915 if (dst->dev->mtu < mtu)
916 return;
917
918 if (mtu < ip_rt_min_pmtu)
919 mtu = ip_rt_min_pmtu;
920
921 if (!rt->rt_pmtu) {
922 dst->obsolete = DST_OBSOLETE_KILL;
923 } else {
924 rt->rt_pmtu = mtu;
925 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
926 }
927
928 rcu_read_lock();
929 if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
930 struct fib_nh *nh = &FIB_RES_NH(res);
931
932 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
933 jiffies + ip_rt_mtu_expires);
934 }
935 rcu_read_unlock();
936}
937
938static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
939 struct sk_buff *skb, u32 mtu)
940{
941 struct rtable *rt = (struct rtable *) dst;
942 struct flowi4 fl4;
943
944 ip_rt_build_flow_key(&fl4, sk, skb);
945 __ip_rt_update_pmtu(rt, &fl4, mtu);
946}
947
948void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
949 int oif, u32 mark, u8 protocol, int flow_flags)
950{
951 const struct iphdr *iph = (const struct iphdr *) skb->data;
952 struct flowi4 fl4;
953 struct rtable *rt;
954
955 __build_flow_key(&fl4, NULL, iph, oif,
956 RT_TOS(iph->tos), protocol, mark, flow_flags);
957 rt = __ip_route_output_key(net, &fl4);
958 if (!IS_ERR(rt)) {
959 __ip_rt_update_pmtu(rt, &fl4, mtu);
960 ip_rt_put(rt);
961 }
962}
963EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
964
965void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
966{
967 const struct iphdr *iph = (const struct iphdr *) skb->data;
968 struct flowi4 fl4;
969 struct rtable *rt;
970
971 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
972 rt = __ip_route_output_key(sock_net(sk), &fl4);
973 if (!IS_ERR(rt)) {
974 __ip_rt_update_pmtu(rt, &fl4, mtu);
975 ip_rt_put(rt);
976 }
977}
978EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
979
980void ipv4_redirect(struct sk_buff *skb, struct net *net,
981 int oif, u32 mark, u8 protocol, int flow_flags)
982{
983 const struct iphdr *iph = (const struct iphdr *) skb->data;
984 struct flowi4 fl4;
985 struct rtable *rt;
986
987 __build_flow_key(&fl4, NULL, iph, oif,
988 RT_TOS(iph->tos), protocol, mark, flow_flags);
989 rt = __ip_route_output_key(net, &fl4);
990 if (!IS_ERR(rt)) {
991 __ip_do_redirect(rt, skb, &fl4, false);
992 ip_rt_put(rt);
993 }
994}
995EXPORT_SYMBOL_GPL(ipv4_redirect);
996
997void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
998{
999 const struct iphdr *iph = (const struct iphdr *) skb->data;
1000 struct flowi4 fl4;
1001 struct rtable *rt;
1002
1003 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1004 rt = __ip_route_output_key(sock_net(sk), &fl4);
1005 if (!IS_ERR(rt)) {
1006 __ip_do_redirect(rt, skb, &fl4, false);
1007 ip_rt_put(rt);
1008 }
1009}
1010EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1011
1012static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1013{
1014 struct rtable *rt = (struct rtable *) dst;
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1025 return NULL;
1026 return dst;
1027}
1028
1029static void ipv4_link_failure(struct sk_buff *skb)
1030{
1031 struct rtable *rt;
1032
1033 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034
1035 rt = skb_rtable(skb);
1036 if (rt)
1037 dst_set_expires(&rt->dst, 0);
1038}
1039
1040static int ip_rt_bug(struct sk_buff *skb)
1041{
1042 pr_debug("%s: %pI4 -> %pI4, %s\n",
1043 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044 skb->dev ? skb->dev->name : "?");
1045 kfree_skb(skb);
1046 WARN_ON(1);
1047 return 0;
1048}
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1060{
1061 __be32 src;
1062
1063 if (rt_is_output_route(rt))
1064 src = ip_hdr(skb)->saddr;
1065 else {
1066 struct fib_result res;
1067 struct flowi4 fl4;
1068 struct iphdr *iph;
1069
1070 iph = ip_hdr(skb);
1071
1072 memset(&fl4, 0, sizeof(fl4));
1073 fl4.daddr = iph->daddr;
1074 fl4.saddr = iph->saddr;
1075 fl4.flowi4_tos = RT_TOS(iph->tos);
1076 fl4.flowi4_oif = rt->dst.dev->ifindex;
1077 fl4.flowi4_iif = skb->dev->ifindex;
1078 fl4.flowi4_mark = skb->mark;
1079
1080 rcu_read_lock();
1081 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1082 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1083 else
1084 src = inet_select_addr(rt->dst.dev,
1085 rt_nexthop(rt, iph->daddr),
1086 RT_SCOPE_UNIVERSE);
1087 rcu_read_unlock();
1088 }
1089 memcpy(addr, &src, 4);
1090}
1091
1092#ifdef CONFIG_IP_ROUTE_CLASSID
1093static void set_class_tag(struct rtable *rt, u32 tag)
1094{
1095 if (!(rt->dst.tclassid & 0xFFFF))
1096 rt->dst.tclassid |= tag & 0xFFFF;
1097 if (!(rt->dst.tclassid & 0xFFFF0000))
1098 rt->dst.tclassid |= tag & 0xFFFF0000;
1099}
1100#endif
1101
1102static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1103{
1104 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1105
1106 if (advmss == 0) {
1107 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1108 ip_rt_min_advmss);
1109 if (advmss > 65535 - 40)
1110 advmss = 65535 - 40;
1111 }
1112 return advmss;
1113}
1114
1115static unsigned int ipv4_mtu(const struct dst_entry *dst)
1116{
1117 const struct rtable *rt = (const struct rtable *) dst;
1118 unsigned int mtu = rt->rt_pmtu;
1119
1120 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1121 mtu = 0;
1122
1123 if (!mtu)
1124 mtu = dst_metric_raw(dst, RTAX_MTU);
1125
1126 if (mtu && rt_is_output_route(rt))
1127 return mtu;
1128
1129 mtu = dst->dev->mtu;
1130
1131 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1132 if (rt->rt_uses_gateway && mtu > 576)
1133 mtu = 576;
1134 }
1135
1136 if (mtu > IP_MAX_MTU)
1137 mtu = IP_MAX_MTU;
1138
1139 return mtu;
1140}
1141
1142static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1143{
1144 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1145 struct fib_nh_exception *fnhe;
1146 u32 hval;
1147
1148 if (!hash)
1149 return NULL;
1150
1151 hval = fnhe_hashfun(daddr);
1152
1153 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1154 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1155 if (fnhe->fnhe_daddr == daddr)
1156 return fnhe;
1157 }
1158 return NULL;
1159}
1160
1161static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1162 __be32 daddr)
1163{
1164 bool ret = false;
1165
1166 spin_lock_bh(&fnhe_lock);
1167
1168 if (daddr == fnhe->fnhe_daddr) {
1169 struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1170 if (orig && rt_is_expired(orig)) {
1171 fnhe->fnhe_gw = 0;
1172 fnhe->fnhe_pmtu = 0;
1173 fnhe->fnhe_expires = 0;
1174 }
1175 if (fnhe->fnhe_pmtu) {
1176 unsigned long expires = fnhe->fnhe_expires;
1177 unsigned long diff = expires - jiffies;
1178
1179 if (time_before(jiffies, expires)) {
1180 rt->rt_pmtu = fnhe->fnhe_pmtu;
1181 dst_set_expires(&rt->dst, diff);
1182 }
1183 }
1184 if (fnhe->fnhe_gw) {
1185 rt->rt_flags |= RTCF_REDIRECTED;
1186 rt->rt_gateway = fnhe->fnhe_gw;
1187 rt->rt_uses_gateway = 1;
1188 } else if (!rt->rt_gateway)
1189 rt->rt_gateway = daddr;
1190
1191 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1192 if (orig)
1193 rt_free(orig);
1194
1195 fnhe->fnhe_stamp = jiffies;
1196 ret = true;
1197 }
1198 spin_unlock_bh(&fnhe_lock);
1199
1200 return ret;
1201}
1202
1203static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1204{
1205 struct rtable *orig, *prev, **p;
1206 bool ret = true;
1207
1208 if (rt_is_input_route(rt)) {
1209 p = (struct rtable **)&nh->nh_rth_input;
1210 } else {
1211 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1212 }
1213 orig = *p;
1214
1215 prev = cmpxchg(p, orig, rt);
1216 if (prev == orig) {
1217 if (orig)
1218 rt_free(orig);
1219 } else
1220 ret = false;
1221
1222 return ret;
1223}
1224
1225static DEFINE_SPINLOCK(rt_uncached_lock);
1226static LIST_HEAD(rt_uncached_list);
1227
1228static void rt_add_uncached_list(struct rtable *rt)
1229{
1230 spin_lock_bh(&rt_uncached_lock);
1231 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1232 spin_unlock_bh(&rt_uncached_lock);
1233}
1234
1235static void ipv4_dst_destroy(struct dst_entry *dst)
1236{
1237 struct rtable *rt = (struct rtable *) dst;
1238
1239 if (!list_empty(&rt->rt_uncached)) {
1240 spin_lock_bh(&rt_uncached_lock);
1241 list_del(&rt->rt_uncached);
1242 spin_unlock_bh(&rt_uncached_lock);
1243 }
1244}
1245
1246void rt_flush_dev(struct net_device *dev)
1247{
1248 if (!list_empty(&rt_uncached_list)) {
1249 struct net *net = dev_net(dev);
1250 struct rtable *rt;
1251
1252 spin_lock_bh(&rt_uncached_lock);
1253 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1254 if (rt->dst.dev != dev)
1255 continue;
1256 rt->dst.dev = net->loopback_dev;
1257 dev_hold(rt->dst.dev);
1258 dev_put(dev);
1259 }
1260 spin_unlock_bh(&rt_uncached_lock);
1261 }
1262}
1263
1264static bool rt_cache_valid(const struct rtable *rt)
1265{
1266 return rt &&
1267 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1268 !rt_is_expired(rt);
1269}
1270
1271static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1272 const struct fib_result *res,
1273 struct fib_nh_exception *fnhe,
1274 struct fib_info *fi, u16 type, u32 itag)
1275{
1276 bool cached = false;
1277
1278 if (fi) {
1279 struct fib_nh *nh = &FIB_RES_NH(*res);
1280
1281 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1282 rt->rt_gateway = nh->nh_gw;
1283 rt->rt_uses_gateway = 1;
1284 }
1285 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1286#ifdef CONFIG_IP_ROUTE_CLASSID
1287 rt->dst.tclassid = nh->nh_tclassid;
1288#endif
1289 if (unlikely(fnhe))
1290 cached = rt_bind_exception(rt, fnhe, daddr);
1291 else if (!(rt->dst.flags & DST_NOCACHE))
1292 cached = rt_cache_route(nh, rt);
1293 if (unlikely(!cached)) {
1294
1295
1296
1297
1298
1299 rt->dst.flags |= DST_NOCACHE;
1300 if (!rt->rt_gateway)
1301 rt->rt_gateway = daddr;
1302 rt_add_uncached_list(rt);
1303 }
1304 } else
1305 rt_add_uncached_list(rt);
1306
1307#ifdef CONFIG_IP_ROUTE_CLASSID
1308#ifdef CONFIG_IP_MULTIPLE_TABLES
1309 set_class_tag(rt, res->tclassid);
1310#endif
1311 set_class_tag(rt, itag);
1312#endif
1313}
1314
1315static struct rtable *rt_dst_alloc(struct net_device *dev,
1316 bool nopolicy, bool noxfrm, bool will_cache)
1317{
1318 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1319 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1320 (nopolicy ? DST_NOPOLICY : 0) |
1321 (noxfrm ? DST_NOXFRM : 0));
1322}
1323
1324
1325static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1326 u8 tos, struct net_device *dev, int our)
1327{
1328 struct rtable *rth;
1329 struct in_device *in_dev = __in_dev_get_rcu(dev);
1330 u32 itag = 0;
1331 int err;
1332
1333
1334
1335 if (in_dev == NULL)
1336 return -EINVAL;
1337
1338 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1339 skb->protocol != htons(ETH_P_IP))
1340 goto e_inval;
1341
1342 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1343 if (ipv4_is_loopback(saddr))
1344 goto e_inval;
1345
1346 if (ipv4_is_zeronet(saddr)) {
1347 if (!ipv4_is_local_multicast(daddr))
1348 goto e_inval;
1349 } else {
1350 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1351 in_dev, &itag);
1352 if (err < 0)
1353 goto e_err;
1354 }
1355 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1356 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1357 if (!rth)
1358 goto e_nobufs;
1359
1360#ifdef CONFIG_IP_ROUTE_CLASSID
1361 rth->dst.tclassid = itag;
1362#endif
1363 rth->dst.output = ip_rt_bug;
1364
1365 rth->rt_genid = rt_genid(dev_net(dev));
1366 rth->rt_flags = RTCF_MULTICAST;
1367 rth->rt_type = RTN_MULTICAST;
1368 rth->rt_is_input= 1;
1369 rth->rt_iif = 0;
1370 rth->rt_pmtu = 0;
1371 rth->rt_gateway = 0;
1372 rth->rt_uses_gateway = 0;
1373 INIT_LIST_HEAD(&rth->rt_uncached);
1374 if (our) {
1375 rth->dst.input= ip_local_deliver;
1376 rth->rt_flags |= RTCF_LOCAL;
1377 }
1378
1379#ifdef CONFIG_IP_MROUTE
1380 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1381 rth->dst.input = ip_mr_input;
1382#endif
1383 RT_CACHE_STAT_INC(in_slow_mc);
1384
1385 skb_dst_set(skb, &rth->dst);
1386 return 0;
1387
1388e_nobufs:
1389 return -ENOBUFS;
1390e_inval:
1391 return -EINVAL;
1392e_err:
1393 return err;
1394}
1395
1396
1397static void ip_handle_martian_source(struct net_device *dev,
1398 struct in_device *in_dev,
1399 struct sk_buff *skb,
1400 __be32 daddr,
1401 __be32 saddr)
1402{
1403 RT_CACHE_STAT_INC(in_martian_src);
1404#ifdef CONFIG_IP_ROUTE_VERBOSE
1405 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1406
1407
1408
1409
1410 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1411 &daddr, &saddr, dev->name);
1412 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1413 print_hex_dump(KERN_WARNING, "ll header: ",
1414 DUMP_PREFIX_OFFSET, 16, 1,
1415 skb_mac_header(skb),
1416 dev->hard_header_len, true);
1417 }
1418 }
1419#endif
1420}
1421
1422
1423static int __mkroute_input(struct sk_buff *skb,
1424 const struct fib_result *res,
1425 struct in_device *in_dev,
1426 __be32 daddr, __be32 saddr, u32 tos)
1427{
1428 struct rtable *rth;
1429 int err;
1430 struct in_device *out_dev;
1431 unsigned int flags = 0;
1432 bool do_cache;
1433 u32 itag;
1434
1435
1436 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1437 if (out_dev == NULL) {
1438 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1439 return -EINVAL;
1440 }
1441
1442 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1443 in_dev->dev, in_dev, &itag);
1444 if (err < 0) {
1445 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1446 saddr);
1447
1448 goto cleanup;
1449 }
1450
1451 do_cache = res->fi && !itag;
1452 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1453 (IN_DEV_SHARED_MEDIA(out_dev) ||
1454 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1455 flags |= RTCF_DOREDIRECT;
1456 do_cache = false;
1457 }
1458
1459 if (skb->protocol != htons(ETH_P_IP)) {
1460
1461
1462
1463
1464
1465
1466
1467 if (out_dev == in_dev &&
1468 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1469 err = -EINVAL;
1470 goto cleanup;
1471 }
1472 }
1473
1474 if (do_cache) {
1475 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1476 if (rt_cache_valid(rth)) {
1477 skb_dst_set_noref(skb, &rth->dst);
1478 goto out;
1479 }
1480 }
1481
1482 rth = rt_dst_alloc(out_dev->dev,
1483 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1484 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1485 if (!rth) {
1486 err = -ENOBUFS;
1487 goto cleanup;
1488 }
1489
1490 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1491 rth->rt_flags = flags;
1492 rth->rt_type = res->type;
1493 rth->rt_is_input = 1;
1494 rth->rt_iif = 0;
1495 rth->rt_pmtu = 0;
1496 rth->rt_gateway = 0;
1497 rth->rt_uses_gateway = 0;
1498 INIT_LIST_HEAD(&rth->rt_uncached);
1499
1500 rth->dst.input = ip_forward;
1501 rth->dst.output = ip_output;
1502
1503 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1504 skb_dst_set(skb, &rth->dst);
1505out:
1506 err = 0;
1507 cleanup:
1508 return err;
1509}
1510
1511static int ip_mkroute_input(struct sk_buff *skb,
1512 struct fib_result *res,
1513 const struct flowi4 *fl4,
1514 struct in_device *in_dev,
1515 __be32 daddr, __be32 saddr, u32 tos)
1516{
1517#ifdef CONFIG_IP_ROUTE_MULTIPATH
1518 if (res->fi && res->fi->fib_nhs > 1)
1519 fib_select_multipath(res);
1520#endif
1521
1522
1523 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1524}
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1538 u8 tos, struct net_device *dev)
1539{
1540 struct fib_result res;
1541 struct in_device *in_dev = __in_dev_get_rcu(dev);
1542 struct flowi4 fl4;
1543 unsigned int flags = 0;
1544 u32 itag = 0;
1545 struct rtable *rth;
1546 int err = -EINVAL;
1547 struct net *net = dev_net(dev);
1548 bool do_cache;
1549
1550
1551
1552 if (!in_dev)
1553 goto out;
1554
1555
1556
1557
1558
1559 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1560 goto martian_source;
1561
1562 res.fi = NULL;
1563 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1564 goto brd_input;
1565
1566
1567
1568
1569 if (ipv4_is_zeronet(saddr))
1570 goto martian_source;
1571
1572 if (ipv4_is_zeronet(daddr))
1573 goto martian_destination;
1574
1575 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1576 if (ipv4_is_loopback(daddr))
1577 goto martian_destination;
1578
1579 if (ipv4_is_loopback(saddr))
1580 goto martian_source;
1581 }
1582
1583
1584
1585
1586 fl4.flowi4_oif = 0;
1587 fl4.flowi4_iif = dev->ifindex;
1588 fl4.flowi4_mark = skb->mark;
1589 fl4.flowi4_tos = tos;
1590 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1591 fl4.daddr = daddr;
1592 fl4.saddr = saddr;
1593 err = fib_lookup(net, &fl4, &res);
1594 if (err != 0)
1595 goto no_route;
1596
1597 RT_CACHE_STAT_INC(in_slow_tot);
1598
1599 if (res.type == RTN_BROADCAST)
1600 goto brd_input;
1601
1602 if (res.type == RTN_LOCAL) {
1603 err = fib_validate_source(skb, saddr, daddr, tos,
1604 net->loopback_dev->ifindex,
1605 dev, in_dev, &itag);
1606 if (err < 0)
1607 goto martian_source_keep_err;
1608 goto local_input;
1609 }
1610
1611 if (!IN_DEV_FORWARD(in_dev))
1612 goto no_route;
1613 if (res.type != RTN_UNICAST)
1614 goto martian_destination;
1615
1616 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1617out: return err;
1618
1619brd_input:
1620 if (skb->protocol != htons(ETH_P_IP))
1621 goto e_inval;
1622
1623 if (!ipv4_is_zeronet(saddr)) {
1624 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1625 in_dev, &itag);
1626 if (err < 0)
1627 goto martian_source_keep_err;
1628 }
1629 flags |= RTCF_BROADCAST;
1630 res.type = RTN_BROADCAST;
1631 RT_CACHE_STAT_INC(in_brd);
1632
1633local_input:
1634 do_cache = false;
1635 if (res.fi) {
1636 if (!itag) {
1637 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1638 if (rt_cache_valid(rth)) {
1639 skb_dst_set_noref(skb, &rth->dst);
1640 err = 0;
1641 goto out;
1642 }
1643 do_cache = true;
1644 }
1645 }
1646
1647 rth = rt_dst_alloc(net->loopback_dev,
1648 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1649 if (!rth)
1650 goto e_nobufs;
1651
1652 rth->dst.input= ip_local_deliver;
1653 rth->dst.output= ip_rt_bug;
1654#ifdef CONFIG_IP_ROUTE_CLASSID
1655 rth->dst.tclassid = itag;
1656#endif
1657
1658 rth->rt_genid = rt_genid(net);
1659 rth->rt_flags = flags|RTCF_LOCAL;
1660 rth->rt_type = res.type;
1661 rth->rt_is_input = 1;
1662 rth->rt_iif = 0;
1663 rth->rt_pmtu = 0;
1664 rth->rt_gateway = 0;
1665 rth->rt_uses_gateway = 0;
1666 INIT_LIST_HEAD(&rth->rt_uncached);
1667 if (res.type == RTN_UNREACHABLE) {
1668 rth->dst.input= ip_error;
1669 rth->dst.error= -err;
1670 rth->rt_flags &= ~RTCF_LOCAL;
1671 }
1672 if (do_cache)
1673 rt_cache_route(&FIB_RES_NH(res), rth);
1674 skb_dst_set(skb, &rth->dst);
1675 err = 0;
1676 goto out;
1677
1678no_route:
1679 RT_CACHE_STAT_INC(in_no_route);
1680 res.type = RTN_UNREACHABLE;
1681 if (err == -ESRCH)
1682 err = -ENETUNREACH;
1683 goto local_input;
1684
1685
1686
1687
1688martian_destination:
1689 RT_CACHE_STAT_INC(in_martian_dst);
1690#ifdef CONFIG_IP_ROUTE_VERBOSE
1691 if (IN_DEV_LOG_MARTIANS(in_dev))
1692 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1693 &daddr, &saddr, dev->name);
1694#endif
1695
1696e_inval:
1697 err = -EINVAL;
1698 goto out;
1699
1700e_nobufs:
1701 err = -ENOBUFS;
1702 goto out;
1703
1704martian_source:
1705 err = -EINVAL;
1706martian_source_keep_err:
1707 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1708 goto out;
1709}
1710
1711int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1712 u8 tos, struct net_device *dev)
1713{
1714 int res;
1715
1716 rcu_read_lock();
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729 if (ipv4_is_multicast(daddr)) {
1730 struct in_device *in_dev = __in_dev_get_rcu(dev);
1731
1732 if (in_dev) {
1733 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1734 ip_hdr(skb)->protocol);
1735 if (our
1736#ifdef CONFIG_IP_MROUTE
1737 ||
1738 (!ipv4_is_local_multicast(daddr) &&
1739 IN_DEV_MFORWARD(in_dev))
1740#endif
1741 ) {
1742 int res = ip_route_input_mc(skb, daddr, saddr,
1743 tos, dev, our);
1744 rcu_read_unlock();
1745 return res;
1746 }
1747 }
1748 rcu_read_unlock();
1749 return -EINVAL;
1750 }
1751 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1752 rcu_read_unlock();
1753 return res;
1754}
1755EXPORT_SYMBOL(ip_route_input_noref);
1756
1757
1758static struct rtable *__mkroute_output(const struct fib_result *res,
1759 const struct flowi4 *fl4, int orig_oif,
1760 struct net_device *dev_out,
1761 unsigned int flags)
1762{
1763 struct fib_info *fi = res->fi;
1764 struct fib_nh_exception *fnhe;
1765 struct in_device *in_dev;
1766 u16 type = res->type;
1767 struct rtable *rth;
1768 bool do_cache;
1769
1770 in_dev = __in_dev_get_rcu(dev_out);
1771 if (!in_dev)
1772 return ERR_PTR(-EINVAL);
1773
1774 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1775 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1776 return ERR_PTR(-EINVAL);
1777
1778 if (ipv4_is_lbcast(fl4->daddr))
1779 type = RTN_BROADCAST;
1780 else if (ipv4_is_multicast(fl4->daddr))
1781 type = RTN_MULTICAST;
1782 else if (ipv4_is_zeronet(fl4->daddr))
1783 return ERR_PTR(-EINVAL);
1784
1785 if (dev_out->flags & IFF_LOOPBACK)
1786 flags |= RTCF_LOCAL;
1787
1788 if (type == RTN_BROADCAST) {
1789 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1790 fi = NULL;
1791 } else if (type == RTN_MULTICAST) {
1792 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1793 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1794 fl4->flowi4_proto))
1795 flags &= ~RTCF_LOCAL;
1796
1797
1798
1799
1800 if (fi && res->prefixlen < 4)
1801 fi = NULL;
1802 }
1803
1804 fnhe = NULL;
1805 do_cache = fi != NULL;
1806 if (fi) {
1807 struct rtable __rcu **prth;
1808 struct fib_nh *nh = &FIB_RES_NH(*res);
1809
1810 fnhe = find_exception(nh, fl4->daddr);
1811 if (fnhe)
1812 prth = &fnhe->fnhe_rth;
1813 else {
1814 if (unlikely(fl4->flowi4_flags &
1815 FLOWI_FLAG_KNOWN_NH &&
1816 !(nh->nh_gw &&
1817 nh->nh_scope == RT_SCOPE_LINK))) {
1818 do_cache = false;
1819 goto add;
1820 }
1821 prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1822 }
1823 rth = rcu_dereference(*prth);
1824 if (rt_cache_valid(rth)) {
1825 dst_hold(&rth->dst);
1826 return rth;
1827 }
1828 }
1829
1830add:
1831 rth = rt_dst_alloc(dev_out,
1832 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1833 IN_DEV_CONF_GET(in_dev, NOXFRM),
1834 do_cache);
1835 if (!rth)
1836 return ERR_PTR(-ENOBUFS);
1837
1838 rth->dst.output = ip_output;
1839
1840 rth->rt_genid = rt_genid(dev_net(dev_out));
1841 rth->rt_flags = flags;
1842 rth->rt_type = type;
1843 rth->rt_is_input = 0;
1844 rth->rt_iif = orig_oif ? : 0;
1845 rth->rt_pmtu = 0;
1846 rth->rt_gateway = 0;
1847 rth->rt_uses_gateway = 0;
1848 INIT_LIST_HEAD(&rth->rt_uncached);
1849
1850 RT_CACHE_STAT_INC(out_slow_tot);
1851
1852 if (flags & RTCF_LOCAL)
1853 rth->dst.input = ip_local_deliver;
1854 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1855 if (flags & RTCF_LOCAL &&
1856 !(dev_out->flags & IFF_LOOPBACK)) {
1857 rth->dst.output = ip_mc_output;
1858 RT_CACHE_STAT_INC(out_slow_mc);
1859 }
1860#ifdef CONFIG_IP_MROUTE
1861 if (type == RTN_MULTICAST) {
1862 if (IN_DEV_MFORWARD(in_dev) &&
1863 !ipv4_is_local_multicast(fl4->daddr)) {
1864 rth->dst.input = ip_mr_input;
1865 rth->dst.output = ip_mc_output;
1866 }
1867 }
1868#endif
1869 }
1870
1871 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1872
1873 return rth;
1874}
1875
1876
1877
1878
1879
1880struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1881{
1882 struct net_device *dev_out = NULL;
1883 __u8 tos = RT_FL_TOS(fl4);
1884 unsigned int flags = 0;
1885 struct fib_result res;
1886 struct rtable *rth;
1887 int orig_oif;
1888
1889 res.tclassid = 0;
1890 res.fi = NULL;
1891 res.table = NULL;
1892
1893 orig_oif = fl4->flowi4_oif;
1894
1895 fl4->flowi4_iif = net->loopback_dev->ifindex;
1896 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1897 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1898 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1899
1900 rcu_read_lock();
1901 if (fl4->saddr) {
1902 rth = ERR_PTR(-EINVAL);
1903 if (ipv4_is_multicast(fl4->saddr) ||
1904 ipv4_is_lbcast(fl4->saddr) ||
1905 ipv4_is_zeronet(fl4->saddr))
1906 goto out;
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916 if (fl4->flowi4_oif == 0 &&
1917 (ipv4_is_multicast(fl4->daddr) ||
1918 ipv4_is_lbcast(fl4->daddr))) {
1919
1920 dev_out = __ip_dev_find(net, fl4->saddr, false);
1921 if (dev_out == NULL)
1922 goto out;
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939 fl4->flowi4_oif = dev_out->ifindex;
1940 goto make_route;
1941 }
1942
1943 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1944
1945 if (!__ip_dev_find(net, fl4->saddr, false))
1946 goto out;
1947 }
1948 }
1949
1950
1951 if (fl4->flowi4_oif) {
1952 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1953 rth = ERR_PTR(-ENODEV);
1954 if (dev_out == NULL)
1955 goto out;
1956
1957
1958 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1959 rth = ERR_PTR(-ENETUNREACH);
1960 goto out;
1961 }
1962 if (ipv4_is_local_multicast(fl4->daddr) ||
1963 ipv4_is_lbcast(fl4->daddr)) {
1964 if (!fl4->saddr)
1965 fl4->saddr = inet_select_addr(dev_out, 0,
1966 RT_SCOPE_LINK);
1967 goto make_route;
1968 }
1969 if (fl4->saddr) {
1970 if (ipv4_is_multicast(fl4->daddr))
1971 fl4->saddr = inet_select_addr(dev_out, 0,
1972 fl4->flowi4_scope);
1973 else if (!fl4->daddr)
1974 fl4->saddr = inet_select_addr(dev_out, 0,
1975 RT_SCOPE_HOST);
1976 }
1977 }
1978
1979 if (!fl4->daddr) {
1980 fl4->daddr = fl4->saddr;
1981 if (!fl4->daddr)
1982 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1983 dev_out = net->loopback_dev;
1984 fl4->flowi4_oif = net->loopback_dev->ifindex;
1985 res.type = RTN_LOCAL;
1986 flags |= RTCF_LOCAL;
1987 goto make_route;
1988 }
1989
1990 if (fib_lookup(net, fl4, &res)) {
1991 res.fi = NULL;
1992 res.table = NULL;
1993 if (fl4->flowi4_oif) {
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012 if (fl4->saddr == 0)
2013 fl4->saddr = inet_select_addr(dev_out, 0,
2014 RT_SCOPE_LINK);
2015 res.type = RTN_UNICAST;
2016 goto make_route;
2017 }
2018 rth = ERR_PTR(-ENETUNREACH);
2019 goto out;
2020 }
2021
2022 if (res.type == RTN_LOCAL) {
2023 if (!fl4->saddr) {
2024 if (res.fi->fib_prefsrc)
2025 fl4->saddr = res.fi->fib_prefsrc;
2026 else
2027 fl4->saddr = fl4->daddr;
2028 }
2029 dev_out = net->loopback_dev;
2030 fl4->flowi4_oif = dev_out->ifindex;
2031 flags |= RTCF_LOCAL;
2032 goto make_route;
2033 }
2034
2035#ifdef CONFIG_IP_ROUTE_MULTIPATH
2036 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2037 fib_select_multipath(&res);
2038 else
2039#endif
2040 if (!res.prefixlen &&
2041 res.table->tb_num_default > 1 &&
2042 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2043 fib_select_default(&res);
2044
2045 if (!fl4->saddr)
2046 fl4->saddr = FIB_RES_PREFSRC(net, res);
2047
2048 dev_out = FIB_RES_DEV(res);
2049 fl4->flowi4_oif = dev_out->ifindex;
2050
2051
2052make_route:
2053 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2054
2055out:
2056 rcu_read_unlock();
2057 return rth;
2058}
2059EXPORT_SYMBOL_GPL(__ip_route_output_key);
2060
2061static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2062{
2063 return NULL;
2064}
2065
2066static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2067{
2068 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2069
2070 return mtu ? : dst->dev->mtu;
2071}
2072
2073static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2074 struct sk_buff *skb, u32 mtu)
2075{
2076}
2077
/* ->redirect() handler for blackhole routes: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2082
2083static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2084 unsigned long old)
2085{
2086 return NULL;
2087}
2088
2089static struct dst_ops ipv4_dst_blackhole_ops = {
2090 .family = AF_INET,
2091 .protocol = cpu_to_be16(ETH_P_IP),
2092 .check = ipv4_blackhole_dst_check,
2093 .mtu = ipv4_blackhole_mtu,
2094 .default_advmss = ipv4_default_advmss,
2095 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2096 .redirect = ipv4_rt_blackhole_redirect,
2097 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2098 .neigh_lookup = ipv4_neigh_lookup,
2099};
2100
2101struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2102{
2103 struct rtable *ort = (struct rtable *) dst_orig;
2104 struct rtable *rt;
2105
2106 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2107 if (rt) {
2108 struct dst_entry *new = &rt->dst;
2109
2110 new->__use = 1;
2111 new->input = dst_discard;
2112 new->output = dst_discard;
2113
2114 new->dev = ort->dst.dev;
2115 if (new->dev)
2116 dev_hold(new->dev);
2117
2118 rt->rt_is_input = ort->rt_is_input;
2119 rt->rt_iif = ort->rt_iif;
2120 rt->rt_pmtu = ort->rt_pmtu;
2121
2122 rt->rt_genid = rt_genid(net);
2123 rt->rt_flags = ort->rt_flags;
2124 rt->rt_type = ort->rt_type;
2125 rt->rt_gateway = ort->rt_gateway;
2126 rt->rt_uses_gateway = ort->rt_uses_gateway;
2127
2128 INIT_LIST_HEAD(&rt->rt_uncached);
2129
2130 dst_free(new);
2131 }
2132
2133 dst_release(dst_orig);
2134
2135 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2136}
2137
2138struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2139 struct sock *sk)
2140{
2141 struct rtable *rt = __ip_route_output_key(net, flp4);
2142
2143 if (IS_ERR(rt))
2144 return rt;
2145
2146 if (flp4->flowi4_proto)
2147 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2148 flowi4_to_flowi(flp4),
2149 sk, 0);
2150
2151 return rt;
2152}
2153EXPORT_SYMBOL_GPL(ip_route_output_flow);
2154
2155static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2156 struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2157 u32 seq, int event, int nowait, unsigned int flags)
2158{
2159 struct rtable *rt = skb_rtable(skb);
2160 struct rtmsg *r;
2161 struct nlmsghdr *nlh;
2162 unsigned long expires = 0;
2163 u32 error;
2164 u32 metrics[RTAX_MAX];
2165
2166 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2167 if (nlh == NULL)
2168 return -EMSGSIZE;
2169
2170 r = nlmsg_data(nlh);
2171 r->rtm_family = AF_INET;
2172 r->rtm_dst_len = 32;
2173 r->rtm_src_len = 0;
2174 r->rtm_tos = fl4->flowi4_tos;
2175 r->rtm_table = RT_TABLE_MAIN;
2176 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2177 goto nla_put_failure;
2178 r->rtm_type = rt->rt_type;
2179 r->rtm_scope = RT_SCOPE_UNIVERSE;
2180 r->rtm_protocol = RTPROT_UNSPEC;
2181 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2182 if (rt->rt_flags & RTCF_NOTIFY)
2183 r->rtm_flags |= RTM_F_NOTIFY;
2184
2185 if (nla_put_be32(skb, RTA_DST, dst))
2186 goto nla_put_failure;
2187 if (src) {
2188 r->rtm_src_len = 32;
2189 if (nla_put_be32(skb, RTA_SRC, src))
2190 goto nla_put_failure;
2191 }
2192 if (rt->dst.dev &&
2193 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2194 goto nla_put_failure;
2195#ifdef CONFIG_IP_ROUTE_CLASSID
2196 if (rt->dst.tclassid &&
2197 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2198 goto nla_put_failure;
2199#endif
2200 if (!rt_is_input_route(rt) &&
2201 fl4->saddr != src) {
2202 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2203 goto nla_put_failure;
2204 }
2205 if (rt->rt_uses_gateway &&
2206 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2207 goto nla_put_failure;
2208
2209 expires = rt->dst.expires;
2210 if (expires) {
2211 unsigned long now = jiffies;
2212
2213 if (time_before(now, expires))
2214 expires -= now;
2215 else
2216 expires = 0;
2217 }
2218
2219 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2220 if (rt->rt_pmtu && expires)
2221 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2222 if (rtnetlink_put_metrics(skb, metrics) < 0)
2223 goto nla_put_failure;
2224
2225 if (fl4->flowi4_mark &&
2226 nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2227 goto nla_put_failure;
2228
2229 error = rt->dst.error;
2230
2231 if (rt_is_input_route(rt)) {
2232 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2233 goto nla_put_failure;
2234 }
2235
2236 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2237 goto nla_put_failure;
2238
2239 return nlmsg_end(skb, nlh);
2240
2241nla_put_failure:
2242 nlmsg_cancel(skb, nlh);
2243 return -EMSGSIZE;
2244}
2245
2246static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2247{
2248 struct net *net = sock_net(in_skb->sk);
2249 struct rtmsg *rtm;
2250 struct nlattr *tb[RTA_MAX+1];
2251 struct rtable *rt = NULL;
2252 struct flowi4 fl4;
2253 __be32 dst = 0;
2254 __be32 src = 0;
2255 u32 iif;
2256 int err;
2257 int mark;
2258 struct sk_buff *skb;
2259
2260 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2261 if (err < 0)
2262 goto errout;
2263
2264 rtm = nlmsg_data(nlh);
2265
2266 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2267 if (skb == NULL) {
2268 err = -ENOBUFS;
2269 goto errout;
2270 }
2271
2272
2273
2274
2275 skb_reset_mac_header(skb);
2276 skb_reset_network_header(skb);
2277
2278
2279 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2280 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2281
2282 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2283 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2284 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2285 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2286
2287 memset(&fl4, 0, sizeof(fl4));
2288 fl4.daddr = dst;
2289 fl4.saddr = src;
2290 fl4.flowi4_tos = rtm->rtm_tos;
2291 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2292 fl4.flowi4_mark = mark;
2293
2294 if (iif) {
2295 struct net_device *dev;
2296
2297 dev = __dev_get_by_index(net, iif);
2298 if (dev == NULL) {
2299 err = -ENODEV;
2300 goto errout_free;
2301 }
2302
2303 skb->protocol = htons(ETH_P_IP);
2304 skb->dev = dev;
2305 skb->mark = mark;
2306 local_bh_disable();
2307 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2308 local_bh_enable();
2309
2310 rt = skb_rtable(skb);
2311 if (err == 0 && rt->dst.error)
2312 err = -rt->dst.error;
2313 } else {
2314 rt = ip_route_output_key(net, &fl4);
2315
2316 err = 0;
2317 if (IS_ERR(rt))
2318 err = PTR_ERR(rt);
2319 }
2320
2321 if (err)
2322 goto errout_free;
2323
2324 skb_dst_set(skb, &rt->dst);
2325 if (rtm->rtm_flags & RTM_F_NOTIFY)
2326 rt->rt_flags |= RTCF_NOTIFY;
2327
2328 err = rt_fill_info(net, dst, src, &fl4, skb,
2329 NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2330 RTM_NEWROUTE, 0, 0);
2331 if (err <= 0)
2332 goto errout_free;
2333
2334 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2335errout:
2336 return err;
2337
2338errout_free:
2339 kfree_skb(skb);
2340 goto errout;
2341}
2342
2343int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2344{
2345 return skb->len;
2346}
2347
2348void ip_rt_multicast_event(struct in_device *in_dev)
2349{
2350 rt_cache_flush(dev_net(in_dev->dev));
2351}
2352
2353#ifdef CONFIG_SYSCTL
2354static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2355 void __user *buffer,
2356 size_t *lenp, loff_t *ppos)
2357{
2358 if (write) {
2359 rt_cache_flush((struct net *)__ctl->extra1);
2360 return 0;
2361 }
2362
2363 return -EINVAL;
2364}
2365
2366static ctl_table ipv4_route_table[] = {
2367 {
2368 .procname = "gc_thresh",
2369 .data = &ipv4_dst_ops.gc_thresh,
2370 .maxlen = sizeof(int),
2371 .mode = 0644,
2372 .proc_handler = proc_dointvec,
2373 },
2374 {
2375 .procname = "max_size",
2376 .data = &ip_rt_max_size,
2377 .maxlen = sizeof(int),
2378 .mode = 0644,
2379 .proc_handler = proc_dointvec,
2380 },
2381 {
2382
2383
2384 .procname = "gc_min_interval",
2385 .data = &ip_rt_gc_min_interval,
2386 .maxlen = sizeof(int),
2387 .mode = 0644,
2388 .proc_handler = proc_dointvec_jiffies,
2389 },
2390 {
2391 .procname = "gc_min_interval_ms",
2392 .data = &ip_rt_gc_min_interval,
2393 .maxlen = sizeof(int),
2394 .mode = 0644,
2395 .proc_handler = proc_dointvec_ms_jiffies,
2396 },
2397 {
2398 .procname = "gc_timeout",
2399 .data = &ip_rt_gc_timeout,
2400 .maxlen = sizeof(int),
2401 .mode = 0644,
2402 .proc_handler = proc_dointvec_jiffies,
2403 },
2404 {
2405 .procname = "gc_interval",
2406 .data = &ip_rt_gc_interval,
2407 .maxlen = sizeof(int),
2408 .mode = 0644,
2409 .proc_handler = proc_dointvec_jiffies,
2410 },
2411 {
2412 .procname = "redirect_load",
2413 .data = &ip_rt_redirect_load,
2414 .maxlen = sizeof(int),
2415 .mode = 0644,
2416 .proc_handler = proc_dointvec,
2417 },
2418 {
2419 .procname = "redirect_number",
2420 .data = &ip_rt_redirect_number,
2421 .maxlen = sizeof(int),
2422 .mode = 0644,
2423 .proc_handler = proc_dointvec,
2424 },
2425 {
2426 .procname = "redirect_silence",
2427 .data = &ip_rt_redirect_silence,
2428 .maxlen = sizeof(int),
2429 .mode = 0644,
2430 .proc_handler = proc_dointvec,
2431 },
2432 {
2433 .procname = "error_cost",
2434 .data = &ip_rt_error_cost,
2435 .maxlen = sizeof(int),
2436 .mode = 0644,
2437 .proc_handler = proc_dointvec,
2438 },
2439 {
2440 .procname = "error_burst",
2441 .data = &ip_rt_error_burst,
2442 .maxlen = sizeof(int),
2443 .mode = 0644,
2444 .proc_handler = proc_dointvec,
2445 },
2446 {
2447 .procname = "gc_elasticity",
2448 .data = &ip_rt_gc_elasticity,
2449 .maxlen = sizeof(int),
2450 .mode = 0644,
2451 .proc_handler = proc_dointvec,
2452 },
2453 {
2454 .procname = "mtu_expires",
2455 .data = &ip_rt_mtu_expires,
2456 .maxlen = sizeof(int),
2457 .mode = 0644,
2458 .proc_handler = proc_dointvec_jiffies,
2459 },
2460 {
2461 .procname = "min_pmtu",
2462 .data = &ip_rt_min_pmtu,
2463 .maxlen = sizeof(int),
2464 .mode = 0644,
2465 .proc_handler = proc_dointvec,
2466 },
2467 {
2468 .procname = "min_adv_mss",
2469 .data = &ip_rt_min_advmss,
2470 .maxlen = sizeof(int),
2471 .mode = 0644,
2472 .proc_handler = proc_dointvec,
2473 },
2474 { }
2475};
2476
2477static struct ctl_table ipv4_route_flush_table[] = {
2478 {
2479 .procname = "flush",
2480 .maxlen = sizeof(int),
2481 .mode = 0200,
2482 .proc_handler = ipv4_sysctl_rtcache_flush,
2483 },
2484 { },
2485};
2486
2487static __net_init int sysctl_route_net_init(struct net *net)
2488{
2489 struct ctl_table *tbl;
2490
2491 tbl = ipv4_route_flush_table;
2492 if (!net_eq(net, &init_net)) {
2493 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2494 if (tbl == NULL)
2495 goto err_dup;
2496 }
2497 tbl[0].extra1 = net;
2498
2499 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2500 if (net->ipv4.route_hdr == NULL)
2501 goto err_reg;
2502 return 0;
2503
2504err_reg:
2505 if (tbl != ipv4_route_flush_table)
2506 kfree(tbl);
2507err_dup:
2508 return -ENOMEM;
2509}
2510
2511static __net_exit void sysctl_route_net_exit(struct net *net)
2512{
2513 struct ctl_table *tbl;
2514
2515 tbl = net->ipv4.route_hdr->ctl_table_arg;
2516 unregister_net_sysctl_table(net->ipv4.route_hdr);
2517 BUG_ON(tbl == ipv4_route_flush_table);
2518 kfree(tbl);
2519}
2520
2521static __net_initdata struct pernet_operations sysctl_route_ops = {
2522 .init = sysctl_route_net_init,
2523 .exit = sysctl_route_net_exit,
2524};
2525#endif
2526
2527static __net_init int rt_genid_init(struct net *net)
2528{
2529 atomic_set(&net->rt_genid, 0);
2530 get_random_bytes(&net->ipv4.dev_addr_genid,
2531 sizeof(net->ipv4.dev_addr_genid));
2532 return 0;
2533}
2534
2535static __net_initdata struct pernet_operations rt_genid_ops = {
2536 .init = rt_genid_init,
2537};
2538
2539static int __net_init ipv4_inetpeer_init(struct net *net)
2540{
2541 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2542
2543 if (!bp)
2544 return -ENOMEM;
2545 inet_peer_base_init(bp);
2546 net->ipv4.peers = bp;
2547 return 0;
2548}
2549
2550static void __net_exit ipv4_inetpeer_exit(struct net *net)
2551{
2552 struct inet_peer_base *bp = net->ipv4.peers;
2553
2554 net->ipv4.peers = NULL;
2555 inetpeer_invalidate_tree(bp);
2556 kfree(bp);
2557}
2558
2559static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2560 .init = ipv4_inetpeer_init,
2561 .exit = ipv4_inetpeer_exit,
2562};
2563
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-cpu route accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries per cpu.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif
2567
2568int __init ip_rt_init(void)
2569{
2570 int rc = 0;
2571
2572#ifdef CONFIG_IP_ROUTE_CLASSID
2573 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2574 if (!ip_rt_acct)
2575 panic("IP: failed to allocate ip_rt_acct\n");
2576#endif
2577
2578 ipv4_dst_ops.kmem_cachep =
2579 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2580 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2581
2582 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2583
2584 if (dst_entries_init(&ipv4_dst_ops) < 0)
2585 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2586
2587 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2588 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2589
2590 ipv4_dst_ops.gc_thresh = ~0;
2591 ip_rt_max_size = INT_MAX;
2592
2593 devinet_init();
2594 ip_fib_init();
2595
2596 if (ip_rt_proc_init())
2597 pr_err("Unable to create route proc files\n");
2598#ifdef CONFIG_XFRM
2599 xfrm_init();
2600 xfrm4_init(ip_rt_max_size);
2601#endif
2602 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2603
2604#ifdef CONFIG_SYSCTL
2605 register_pernet_subsys(&sysctl_route_ops);
2606#endif
2607 register_pernet_subsys(&rt_genid_ops);
2608 register_pernet_subsys(&ipv4_inetpeer_ops);
2609 return rc;
2610}
2611
2612#ifdef CONFIG_SYSCTL
2613
2614
2615
2616
2617void __init ip_static_sysctl_init(void)
2618{
2619 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2620}
2621#endif
2622