1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
71#include <linux/mm.h>
72#include <linux/bootmem.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/workqueue.h>
83#include <linux/skbuff.h>
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
93#include <linux/slab.h>
94#include <linux/prefetch.h>
95#include <net/dst.h>
96#include <net/net_namespace.h>
97#include <net/protocol.h>
98#include <net/ip.h>
99#include <net/route.h>
100#include <net/inetpeer.h>
101#include <net/sock.h>
102#include <net/ip_fib.h>
103#include <net/arp.h>
104#include <net/tcp.h>
105#include <net/icmp.h>
106#include <net/xfrm.h>
107#include <net/netevent.h>
108#include <net/rtnetlink.h>
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
112#include <net/secure_seq.h>
113
/* Mask a flowi4 key's TOS field down to IPTOS_RT_MASK plus the RTO_ONLINK bit. */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* MTU ceiling constant (0xFFF0 == 65520). */
#define IP_MAX_MTU	0xFFF0

/* Default garbage-collection timeout: 300 seconds expressed in jiffies. */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * Route cache tunables.  Only some of them are referenced in the visible
 * part of this file; the remainder are presumably consumed further down
 * (sysctl table, redirect/error/PMTU handling) -- verify before changing.
 */
/* Entry count at or above which rt_garbage_collect() will not skip or give up early. */
static int ip_rt_max_size;
/* Age limit handed to rt_may_expire() by rt_check_expire(); also caps the GC "expire" value. */
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
/* Delay between runs of the periodic expiry worker (rt_worker_func). */
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
/* Minimum spacing between effective rt_garbage_collect() runs. */
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
/* Chain length above which rt_intern_hash() evicts its candidate; also scales the GC goal. */
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
/* Chain length that triggers an emergency rebuild; recomputed by rt_check_expire(). */
static int rt_chain_length_max __read_mostly	= 20;

/* Delayed work item re-armed by rt_worker_func() to run rt_check_expire(). */
static struct delayed_work expires_work;
/* jiffies value recorded by the previous rt_check_expire() run. */
static unsigned long expires_ljiffies;
138
139
140
141
142
/*
 * Forward declarations for the handlers installed in ipv4_dst_ops below.
 * rt_garbage_collect() is additionally called directly by rt_intern_hash().
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
151
/* ->ifdown handler installed in ipv4_dst_ops; it performs no work. */
static void ipv4_dst_ifdown(struct dst_entry *dst,
			    struct net_device *dev, int how)
{
	/* nothing to do */
}
156
157static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158{
159 struct rtable *rt = (struct rtable *) dst;
160 struct inet_peer *peer;
161 u32 *p = NULL;
162
163 if (!rt->peer)
164 rt_bind_peer(rt, rt->rt_dst, 1);
165
166 peer = rt->peer;
167 if (peer) {
168 u32 *old_p = __DST_METRICS_PTR(old);
169 unsigned long prev, new;
170
171 p = peer->metrics;
172 if (inet_metrics_new(peer))
173 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
174
175 new = (unsigned long) p;
176 prev = cmpxchg(&dst->_metrics, old, new);
177
178 if (prev != old) {
179 p = __DST_METRICS_PTR(prev);
180 if (prev & DST_METRICS_READ_ONLY)
181 p = NULL;
182 } else {
183 if (rt->fi) {
184 fib_info_put(rt->fi);
185 rt->fi = NULL;
186 }
187 }
188 }
189 return p;
190}
191
/* Declared early so ipv4_dst_ops can reference it; defined further down. */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

/*
 * dst_ops table for IPv4 routes: connects the generic destination-cache
 * hooks to the handlers implemented in this file.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
210
/* Expands to TC_PRIO_<class>, i.e. the same priority as the plain entry for that class. */
#define ECN_OR_COST(class)	TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* packet priorities.  Entries come in pairs:
 * each odd slot goes through ECN_OR_COST() and therefore holds the same
 * value as the even slot before it.
 * NOTE(review): the index is presumably derived from the IP TOS bits --
 * confirm against the users of this table.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
/* One slot of the route cache hash table: head of an RCU-annotated rtable chain. */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
250
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Hash chains are protected by a shared table of spinlocks rather than
 * one lock per bucket.  The table size is a power of two picked from
 * NR_CPUS (fixed at 256 under lockdep) so a slot maps to a lock by masking.
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

/* The lock table, allocated once by rt_hash_lock_init(). */
static spinlock_t	*rt_hash_locks;
/* Address of the spinlock covering hash slot @slot. */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialise the lock table; panics if the allocation fails. */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: the lock address is NULL and init is a no-op. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
296
/* The route cache hash table (array of chain heads). */
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
/* Index mask: rt_hash() ANDs with it and full-table scans cover 0..rt_hash_mask. */
static unsigned			rt_hash_mask __read_mostly;
/* Shift used in the GC sizing arithmetic; presumably log2 of the table size -- confirm at the init site. */
static unsigned int		rt_hash_log  __read_mostly;

/* Per-CPU route cache counters, dumped by rt_cpu_seq_show(). */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment one field of the current CPU's rt_cache_stat. */
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
303
304static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
305 int genid)
306{
307 return jhash_3words((__force u32)daddr, (__force u32)saddr,
308 idx, genid)
309 & rt_hash_mask;
310}
311
312static inline int rt_genid(struct net *net)
313{
314 return atomic_read(&net->ipv4.rt_genid);
315}
316
317#ifdef CONFIG_PROC_FS
/* Private iterator state for the rt_cache seq_file. */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;	/* hash bucket currently being walked (counts down to 0) */
	int genid;	/* generation id sampled in rt_cache_seq_start(); other entries are skipped */
};
323
324static struct rtable *rt_cache_get_first(struct seq_file *seq)
325{
326 struct rt_cache_iter_state *st = seq->private;
327 struct rtable *r = NULL;
328
329 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
330 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
331 continue;
332 rcu_read_lock_bh();
333 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
334 while (r) {
335 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
336 r->rt_genid == st->genid)
337 return r;
338 r = rcu_dereference_bh(r->dst.rt_next);
339 }
340 rcu_read_unlock_bh();
341 }
342 return r;
343}
344
345static struct rtable *__rt_cache_get_next(struct seq_file *seq,
346 struct rtable *r)
347{
348 struct rt_cache_iter_state *st = seq->private;
349
350 r = rcu_dereference_bh(r->dst.rt_next);
351 while (!r) {
352 rcu_read_unlock_bh();
353 do {
354 if (--st->bucket < 0)
355 return NULL;
356 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
357 rcu_read_lock_bh();
358 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
359 }
360 return r;
361}
362
363static struct rtable *rt_cache_get_next(struct seq_file *seq,
364 struct rtable *r)
365{
366 struct rt_cache_iter_state *st = seq->private;
367 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
368 if (dev_net(r->dst.dev) != seq_file_net(seq))
369 continue;
370 if (r->rt_genid == st->genid)
371 break;
372 }
373 return r;
374}
375
376static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
377{
378 struct rtable *r = rt_cache_get_first(seq);
379
380 if (r)
381 while (pos && (r = rt_cache_get_next(seq, r)))
382 --pos;
383 return pos ? NULL : r;
384}
385
386static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
387{
388 struct rt_cache_iter_state *st = seq->private;
389 if (*pos)
390 return rt_cache_get_idx(seq, *pos - 1);
391 st->genid = rt_genid(seq_file_net(seq));
392 return SEQ_START_TOKEN;
393}
394
395static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
396{
397 struct rtable *r;
398
399 if (v == SEQ_START_TOKEN)
400 r = rt_cache_get_first(seq);
401 else
402 r = rt_cache_get_next(seq, v);
403 ++*pos;
404 return r;
405}
406
407static void rt_cache_seq_stop(struct seq_file *seq, void *v)
408{
409 if (v && v != SEQ_START_TOKEN)
410 rcu_read_unlock_bh();
411}
412
413static int rt_cache_seq_show(struct seq_file *seq, void *v)
414{
415 if (v == SEQ_START_TOKEN)
416 seq_printf(seq, "%-127s\n",
417 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
418 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
419 "HHUptod\tSpecDst");
420 else {
421 struct rtable *r = v;
422 struct neighbour *n;
423 int len, HHUptod;
424
425 rcu_read_lock();
426 n = dst_get_neighbour_noref(&r->dst);
427 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
428 rcu_read_unlock();
429
430 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
431 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
432 r->dst.dev ? r->dst.dev->name : "*",
433 (__force u32)r->rt_dst,
434 (__force u32)r->rt_gateway,
435 r->rt_flags, atomic_read(&r->dst.__refcnt),
436 r->dst.__use, 0, (__force u32)r->rt_src,
437 dst_metric_advmss(&r->dst) + 40,
438 dst_metric(&r->dst, RTAX_WINDOW),
439 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
440 dst_metric(&r->dst, RTAX_RTTVAR)),
441 r->rt_key_tos,
442 -1,
443 HHUptod,
444 r->rt_spec_dst, &len);
445
446 seq_printf(seq, "%*s\n", 127 - len, "");
447 }
448 return 0;
449}
450
/* seq_file iterator callbacks for the per-namespace "rt_cache" listing. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
457
458static int rt_cache_seq_open(struct inode *inode, struct file *file)
459{
460 return seq_open_net(inode, file, &rt_cache_seq_ops,
461 sizeof(struct rt_cache_iter_state));
462}
463
/* File operations for the "rt_cache" proc entry created in ip_rt_do_proc_init(). */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
471
472
473static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
474{
475 int cpu;
476
477 if (*pos == 0)
478 return SEQ_START_TOKEN;
479
480 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
481 if (!cpu_possible(cpu))
482 continue;
483 *pos = cpu+1;
484 return &per_cpu(rt_cache_stat, cpu);
485 }
486 return NULL;
487}
488
489static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
490{
491 int cpu;
492
493 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
494 if (!cpu_possible(cpu))
495 continue;
496 *pos = cpu+1;
497 return &per_cpu(rt_cache_stat, cpu);
498 }
499 return NULL;
500
501}
502
/* seq_file ->stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* the iterator holds no lock or reference */
}
507
508static int rt_cpu_seq_show(struct seq_file *seq, void *v)
509{
510 struct rt_cache_stat *st = v;
511
512 if (v == SEQ_START_TOKEN) {
513 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
514 return 0;
515 }
516
517 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
518 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
519 dst_entries_get_slow(&ipv4_dst_ops),
520 st->in_hit,
521 st->in_slow_tot,
522 st->in_slow_mc,
523 st->in_no_route,
524 st->in_brd,
525 st->in_martian_dst,
526 st->in_martian_src,
527
528 st->out_hit,
529 st->out_slow_tot,
530 st->out_slow_mc,
531
532 st->gc_total,
533 st->gc_ignored,
534 st->gc_goal_miss,
535 st->gc_dst_overflow,
536 st->in_hlist_search,
537 st->out_hlist_search
538 );
539 return 0;
540}
541
/* seq_file iterator callbacks for the per-CPU route cache statistics file. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
548
549
550static int rt_cpu_seq_open(struct inode *inode, struct file *file)
551{
552 return seq_open(file, &rt_cpu_seq_ops);
553}
554
/* File operations for the "rt_cache" entry created under net->proc_net_stat. */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
562
563#ifdef CONFIG_IP_ROUTE_CLASSID
564static int rt_acct_proc_show(struct seq_file *m, void *v)
565{
566 struct ip_rt_acct *dst, *src;
567 unsigned int i, j;
568
569 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
570 if (!dst)
571 return -ENOMEM;
572
573 for_each_possible_cpu(i) {
574 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
575 for (j = 0; j < 256; j++) {
576 dst[j].o_bytes += src[j].o_bytes;
577 dst[j].o_packets += src[j].o_packets;
578 dst[j].i_bytes += src[j].i_bytes;
579 dst[j].i_packets += src[j].i_packets;
580 }
581 }
582
583 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
584 kfree(dst);
585 return 0;
586}
587
588static int rt_acct_proc_open(struct inode *inode, struct file *file)
589{
590 return single_open(file, rt_acct_proc_show, NULL);
591}
592
/* File operations for the "rt_acct" proc entry (CONFIG_IP_ROUTE_CLASSID only). */
static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
600#endif
601
/*
 * Per-namespace init: create the route cache proc entries.
 *   - "rt_cache" under net->proc_net       (cache listing)
 *   - "rt_cache" under net->proc_net_stat  (per-CPU statistics)
 *   - "rt_acct"  under net->proc_net       (CONFIG_IP_ROUTE_CLASSID only)
 * On failure the entries created so far are removed in reverse order and
 * -ENOMEM is returned; returns 0 on success.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	/* undo the statistics entry */
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	/* undo the cache listing entry */
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
632
/* Per-namespace exit: remove every proc entry created by ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
641
/* Per-namespace hooks that create and remove the route cache proc entries. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
646
647static int __init ip_rt_proc_init(void)
648{
649 return register_pernet_subsys(&ip_rt_proc_ops);
650}
651
652#else
/* Stub used when CONFIG_PROC_FS is off: there is nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
657#endif
658
659static inline void rt_free(struct rtable *rt)
660{
661 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
662}
663
664static inline void rt_drop(struct rtable *rt)
665{
666 ip_rt_put(rt);
667 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
668}
669
670static inline int rt_fast_clean(struct rtable *rth)
671{
672
673
674 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
675 rt_is_input_route(rth) && rth->dst.rt_next;
676}
677
678static inline int rt_valuable(struct rtable *rth)
679{
680 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
681 (rth->peer && rth->peer->pmtu_expires);
682}
683
684static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
685{
686 unsigned long age;
687 int ret = 0;
688
689 if (atomic_read(&rth->dst.__refcnt))
690 goto out;
691
692 age = jiffies - rth->dst.lastuse;
693 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
694 (age <= tmo2 && rt_valuable(rth)))
695 goto out;
696 ret = 1;
697out: return ret;
698}
699
700
701
702
703
704
705static inline u32 rt_score(struct rtable *rt)
706{
707 u32 score = jiffies - rt->dst.lastuse;
708
709 score = ~score & ~(3<<30);
710
711 if (rt_valuable(rt))
712 score |= (1<<31);
713
714 if (rt_is_output_route(rt) ||
715 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
716 score |= (1<<30);
717
718 return score;
719}
720
721static inline bool rt_caching(const struct net *net)
722{
723 return net->ipv4.current_rt_cache_rebuild_count <=
724 net->ipv4.sysctl_rt_cache_rebuild_count;
725}
726
727static inline bool compare_hash_inputs(const struct rtable *rt1,
728 const struct rtable *rt2)
729{
730 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
731 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
732 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
733}
734
735static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
736{
737 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
738 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
739 (rt1->rt_mark ^ rt2->rt_mark) |
740 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
741 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
742 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
743}
744
745static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
746{
747 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
748}
749
750static inline int rt_is_expired(struct rtable *rth)
751{
752 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
753}
754
755
756
757
758
759
/*
 * Remove cached routes from every hash bucket and release them.
 *
 * @net:             namespace to flush; NULL removes entries of all namespaces
 * @process_context: non-zero permits cond_resched() between buckets
 *
 * Matching entries are unlinked under the bucket lock onto a private
 * list and handed to rt_free() only after the lock has been dropped.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Unlocked peek so empty buckets never take the lock. */
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				/* Unlink from the chain and push onto the private list. */
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				/* Entry stays; it becomes the new predecessor. */
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free the collected entries outside the lock. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
805
806
807
808
809
810
811
812
813
/*
 * Chain lengths are accumulated in fixed point with FRACT_BITS fractional
 * bits; ONE is the fixed-point representation of 1 (has_noalias() returns
 * it per distinct entry, and totals are shifted right by FRACT_BITS).
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
816
817
818
819
820
821
822
823
824static int has_noalias(const struct rtable *head, const struct rtable *rth)
825{
826 const struct rtable *aux = head;
827
828 while (aux != rth) {
829 if (compare_hash_inputs(aux, rth))
830 return 0;
831 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
832 }
833 return ONE;
834}
835
/*
 * Periodic expiry scan, run from rt_worker_func().
 *
 * Scans a number of buckets proportional to the time since the previous
 * run (elapsed jiffies scaled by the table size and divided by
 * ip_rt_gc_timeout, capped at the whole table), resuming after the
 * bucket where the previous run stopped.  Stale-generation and expired
 * entries are freed.  The alias-free lengths of the scanned chains are
 * then used to recompute rt_chain_length_max as
 * max(ip_rt_gc_elasticity, avg + 4 * standard deviation).
 */
static void rt_check_expire(void)
{
	static unsigned int rover;	/* last bucket visited, kept across runs */
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;	/* sum and sum of squares of chain lengths */
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		/* Empty bucket: counted as a sample of length 0, no lock needed. */
		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			/* Entries from an old generation are always dropped. */
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry has an explicit expiry time: keep it until then. */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					/* Each kept entry halves the age allowance for the rest of the chain. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/* Only entries with distinct hash inputs add to the length. */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Expired or idle long enough: unlink and free. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		/* avg and sd are fixed point; shift back to whole entries. */
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
916
917
918
919
920
/*
 * Work handler for expires_work: run one expiry scan and re-arm the
 * delayed work to fire again after ip_rt_gc_interval jiffies.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
926
927
928
929
930
931
932
933static void rt_cache_invalidate(struct net *net)
934{
935 unsigned char shuffle;
936
937 get_random_bytes(&shuffle, sizeof(shuffle));
938 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
939 inetpeer_invalidate_tree(AF_INET);
940}
941
942
943
944
945
/*
 * Invalidate the route cache of @net.  With a negative @delay only the
 * generation id is bumped; otherwise the stale entries are also removed
 * right away.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	rt_do_flush(net, !in_softirq());
}
952
953
/* Remove the cache entries of @net without bumping the generation id. */
void rt_cache_flush_batch(struct net *net)
{
	int process_context = !in_softirq();

	rt_do_flush(net, process_context);
}
958
/*
 * Called by rt_intern_hash() when a chain has grown past
 * rt_chain_length_max: log a rate-limited warning and invalidate the
 * cache of @net so that new entries hash under a fresh generation id.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
965
966
967
968
969
970
971
972
973
974
975
976
977
978
/*
 * ->gc handler of ipv4_dst_ops; also called directly by rt_intern_hash().
 *
 * Tries to free enough unreferenced cache entries to bring the entry
 * count back towards an adaptive "equilibrium".  The aggressiveness
 * (expire), the scan position (rover), the time of the last run and the
 * equilibrium are static and therefore persist across calls.
 *
 * Returns 0 normally, or 1 when the cache is still at or above
 * ip_rt_max_size after collecting ("dst cache overflow").
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	RT_CACHE_STAT_INC(gc_total);

	/* Ran very recently and the cache is below its hard limit: skip this run. */
	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* goal = number of entries we would like to free. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		/* Below elasticity * table size: aim for the equilibrium instead. */
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* Above the elasticity bound: free at least one table's worth of entries. */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* One full pass over the table, starting after the saved rover. */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Kept: halve the allowance for the rest of the chain. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal not reached with the current expire value. */
		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		/* Be twice as aggressive on the next pass or the next call. */
		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Goal met: relax expire again, capped at ip_rt_gc_timeout. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
1098
1099
1100
1101
1102static int slow_chain_length(const struct rtable *head)
1103{
1104 int length = 0;
1105 const struct rtable *rth = head;
1106
1107 while (rth) {
1108 length += has_noalias(head, rth);
1109 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110 }
1111 return length >> FRACT_BITS;
1112}
1113
1114static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1115{
1116 static const __be32 inaddr_any = 0;
1117 struct net_device *dev = dst->dev;
1118 const __be32 *pkey = daddr;
1119 struct neighbour *n;
1120
1121 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1122 pkey = &inaddr_any;
1123
1124 n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1125 if (n)
1126 return n;
1127 return neigh_create(&arp_tbl, pkey, dev);
1128}
1129
1130static int rt_bind_neighbour(struct rtable *rt)
1131{
1132 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1133 if (IS_ERR(n))
1134 return PTR_ERR(n);
1135 dst_set_neighbour(&rt->dst, n);
1136
1137 return 0;
1138}
1139
/*
 * Insert the freshly built route @rt into hash bucket @hash.
 *
 * @hash:    bucket index computed by the caller
 * @rt:      new route; the caller's reference is consumed on failure or
 *           when an equivalent cached entry is found
 * @skb:     if non-NULL, the resulting route is attached to it
 * @ifindex: interface index used to recompute the hash after a rebuild
 *
 * Returns the route now in use (either @rt or an existing entry with the
 * same keys), or an ERR_PTR() when the neighbour could not be bound.
 */
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	/* One GC-and-retry is allowed, and only outside softirq context. */
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * Caching is disabled for this namespace: mark the route
		 * uncached and bind its neighbour, but do not hash it.
		 */
		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		/* Drop stale-generation entries as they are encountered. */
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Equivalent entry already cached: move it to the chain head. */
			*rthp = rth->dst.rt_next;
			/*
			 * rt_next is set before the head pointer is published so
			 * that readers following the head see a complete chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* The new route is redundant; use the cached one. */
			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		/* Track the unreferenced entry with the lowest score as eviction candidate. */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* Evict the candidate only when the chain exceeds the elasticity limit. */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* Nothing evictable and the chain is too long: rebuild under a new generation. */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* The generation id changed, so the bucket must be recomputed. */
			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Bind the neighbour before the route becomes visible in the chain. */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/*
			 * -ENOBUFS: run an aggressive garbage collection with the
			 * tunables temporarily forced (elasticity 1, no minimum
			 * interval) and retry once.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/* Publish the new entry at the head of the chain. */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1319
1320static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1321
1322static u32 rt_peer_genid(void)
1323{
1324 return atomic_read(&__rt_peer_genid);
1325}
1326
1327void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1328{
1329 struct inet_peer *peer;
1330
1331 peer = inet_getpeer_v4(daddr, create);
1332
1333 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1334 inet_putpeer(peer);
1335 else
1336 rt->rt_peer_genid = rt_peer_genid();
1337}
1338
1339
1340
1341
1342
1343
1344
1345
1346static void ip_select_fb_ident(struct iphdr *iph)
1347{
1348 static DEFINE_SPINLOCK(ip_fb_id_lock);
1349 static u32 ip_fallback_id;
1350 u32 salt;
1351
1352 spin_lock_bh(&ip_fb_id_lock);
1353 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1354 iph->id = htons(salt & 0xFFFF);
1355 ip_fallback_id = salt;
1356 spin_unlock_bh(&ip_fb_id_lock);
1357}
1358
1359void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1360{
1361 struct rtable *rt = (struct rtable *) dst;
1362
1363 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1364 if (rt->peer == NULL)
1365 rt_bind_peer(rt, rt->rt_dst, 1);
1366
1367
1368
1369
1370 if (rt->peer) {
1371 iph->id = htons(inet_getid(rt->peer, more));
1372 return;
1373 }
1374 } else if (!rt)
1375 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1376 __builtin_return_address(0));
1377
1378 ip_select_fb_ident(iph);
1379}
1380EXPORT_SYMBOL(__ip_select_ident);
1381
1382static void rt_del(unsigned hash, struct rtable *rt)
1383{
1384 struct rtable __rcu **rthp;
1385 struct rtable *aux;
1386
1387 rthp = &rt_hash_table[hash].chain;
1388 spin_lock_bh(rt_hash_lock_addr(hash));
1389 ip_rt_put(rt);
1390 while ((aux = rcu_dereference_protected(*rthp,
1391 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1392 if (aux == rt || rt_is_expired(aux)) {
1393 *rthp = aux->dst.rt_next;
1394 rt_free(aux);
1395 continue;
1396 }
1397 rthp = &aux->dst.rt_next;
1398 }
1399 spin_unlock_bh(rt_hash_lock_addr(hash));
1400}
1401
1402static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1403{
1404 struct rtable *rt = (struct rtable *) dst;
1405 __be32 orig_gw = rt->rt_gateway;
1406 struct neighbour *n, *old_n;
1407
1408 dst_confirm(&rt->dst);
1409
1410 rt->rt_gateway = peer->redirect_learned.a4;
1411
1412 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1413 if (IS_ERR(n)) {
1414 rt->rt_gateway = orig_gw;
1415 return;
1416 }
1417 old_n = xchg(&rt->dst._neighbour, n);
1418 if (old_n)
1419 neigh_release(old_n);
1420 if (!(n->nud_state & NUD_VALID)) {
1421 neigh_event_send(n, NULL);
1422 } else {
1423 rt->rt_flags |= RTCF_REDIRECTED;
1424 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1425 }
1426}
1427
1428
1429void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1430 __be32 saddr, struct net_device *dev)
1431{
1432 int s, i;
1433 struct in_device *in_dev = __in_dev_get_rcu(dev);
1434 __be32 skeys[2] = { saddr, 0 };
1435 int ikeys[2] = { dev->ifindex, 0 };
1436 struct inet_peer *peer;
1437 struct net *net;
1438
1439 if (!in_dev)
1440 return;
1441
1442 net = dev_net(dev);
1443 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1444 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1445 ipv4_is_zeronet(new_gw))
1446 goto reject_redirect;
1447
1448 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1449 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1450 goto reject_redirect;
1451 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1452 goto reject_redirect;
1453 } else {
1454 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1455 goto reject_redirect;
1456 }
1457
1458 for (s = 0; s < 2; s++) {
1459 for (i = 0; i < 2; i++) {
1460 unsigned int hash;
1461 struct rtable __rcu **rthp;
1462 struct rtable *rt;
1463
1464 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1465
1466 rthp = &rt_hash_table[hash].chain;
1467
1468 while ((rt = rcu_dereference(*rthp)) != NULL) {
1469 rthp = &rt->dst.rt_next;
1470
1471 if (rt->rt_key_dst != daddr ||
1472 rt->rt_key_src != skeys[s] ||
1473 rt->rt_oif != ikeys[i] ||
1474 rt_is_input_route(rt) ||
1475 rt_is_expired(rt) ||
1476 !net_eq(dev_net(rt->dst.dev), net) ||
1477 rt->dst.error ||
1478 rt->dst.dev != dev ||
1479 rt->rt_gateway != old_gw)
1480 continue;
1481
1482 if (!rt->peer)
1483 rt_bind_peer(rt, rt->rt_dst, 1);
1484
1485 peer = rt->peer;
1486 if (peer) {
1487 if (peer->redirect_learned.a4 != new_gw) {
1488 peer->redirect_learned.a4 = new_gw;
1489 atomic_inc(&__rt_peer_genid);
1490 }
1491 check_peer_redir(&rt->dst, peer);
1492 }
1493 }
1494 }
1495 }
1496 return;
1497
1498reject_redirect:
1499#ifdef CONFIG_IP_ROUTE_VERBOSE
1500 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1501 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1502 " Advised path = %pI4 -> %pI4\n",
1503 &old_gw, dev->name, &new_gw,
1504 &saddr, &daddr);
1505#endif
1506 ;
1507}
1508
1509static bool peer_pmtu_expired(struct inet_peer *peer)
1510{
1511 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1512
1513 return orig &&
1514 time_after_eq(jiffies, orig) &&
1515 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1516}
1517
1518static bool peer_pmtu_cleaned(struct inet_peer *peer)
1519{
1520 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1521
1522 return orig &&
1523 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1524}
1525
1526static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1527{
1528 struct rtable *rt = (struct rtable *)dst;
1529 struct dst_entry *ret = dst;
1530
1531 if (rt) {
1532 if (dst->obsolete > 0) {
1533 ip_rt_put(rt);
1534 ret = NULL;
1535 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1536 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1537 rt->rt_oif,
1538 rt_genid(dev_net(dst->dev)));
1539 rt_del(hash, rt);
1540 ret = NULL;
1541 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1542 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1543 }
1544 }
1545 return ret;
1546}
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564void ip_rt_send_redirect(struct sk_buff *skb)
1565{
1566 struct rtable *rt = skb_rtable(skb);
1567 struct in_device *in_dev;
1568 struct inet_peer *peer;
1569 int log_martians;
1570
1571 rcu_read_lock();
1572 in_dev = __in_dev_get_rcu(rt->dst.dev);
1573 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1574 rcu_read_unlock();
1575 return;
1576 }
1577 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1578 rcu_read_unlock();
1579
1580 if (!rt->peer)
1581 rt_bind_peer(rt, rt->rt_dst, 1);
1582 peer = rt->peer;
1583 if (!peer) {
1584 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1585 return;
1586 }
1587
1588
1589
1590
1591 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1592 peer->rate_tokens = 0;
1593
1594
1595
1596
1597 if (peer->rate_tokens >= ip_rt_redirect_number) {
1598 peer->rate_last = jiffies;
1599 return;
1600 }
1601
1602
1603
1604
1605 if (peer->rate_tokens == 0 ||
1606 time_after(jiffies,
1607 (peer->rate_last +
1608 (ip_rt_redirect_load << peer->rate_tokens)))) {
1609 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1610 peer->rate_last = jiffies;
1611 ++peer->rate_tokens;
1612#ifdef CONFIG_IP_ROUTE_VERBOSE
1613 if (log_martians &&
1614 peer->rate_tokens == ip_rt_redirect_number &&
1615 net_ratelimit())
1616 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1617 &ip_hdr(skb)->saddr, rt->rt_iif,
1618 &rt->rt_dst, &rt->rt_gateway);
1619#endif
1620 }
1621}
1622
1623static int ip_error(struct sk_buff *skb)
1624{
1625 struct rtable *rt = skb_rtable(skb);
1626 struct inet_peer *peer;
1627 unsigned long now;
1628 bool send;
1629 int code;
1630
1631 switch (rt->dst.error) {
1632 case EINVAL:
1633 default:
1634 goto out;
1635 case EHOSTUNREACH:
1636 code = ICMP_HOST_UNREACH;
1637 break;
1638 case ENETUNREACH:
1639 code = ICMP_NET_UNREACH;
1640 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1641 IPSTATS_MIB_INNOROUTES);
1642 break;
1643 case EACCES:
1644 code = ICMP_PKT_FILTERED;
1645 break;
1646 }
1647
1648 if (!rt->peer)
1649 rt_bind_peer(rt, rt->rt_dst, 1);
1650 peer = rt->peer;
1651
1652 send = true;
1653 if (peer) {
1654 now = jiffies;
1655 peer->rate_tokens += now - peer->rate_last;
1656 if (peer->rate_tokens > ip_rt_error_burst)
1657 peer->rate_tokens = ip_rt_error_burst;
1658 peer->rate_last = now;
1659 if (peer->rate_tokens >= ip_rt_error_cost)
1660 peer->rate_tokens -= ip_rt_error_cost;
1661 else
1662 send = false;
1663 }
1664 if (send)
1665 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1666
1667out: kfree_skb(skb);
1668 return 0;
1669}
1670
1671
1672
1673
1674
1675
/* Candidate MTU values, in strictly descending order. */
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *step = mtu_plateau;
	const unsigned short *end =
		mtu_plateau + sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	while (step != end) {
		if (*step < old_mtu)
			return *step;
		step++;
	}
	return 68;
}
1688
1689unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1690 unsigned short new_mtu,
1691 struct net_device *dev)
1692{
1693 unsigned short old_mtu = ntohs(iph->tot_len);
1694 unsigned short est_mtu = 0;
1695 struct inet_peer *peer;
1696
1697 peer = inet_getpeer_v4(iph->daddr, 1);
1698 if (peer) {
1699 unsigned short mtu = new_mtu;
1700
1701 if (new_mtu < 68 || new_mtu >= old_mtu) {
1702
1703
1704
1705
1706 if (mtu == 0 &&
1707 old_mtu >= 68 + (iph->ihl << 2))
1708 old_mtu -= iph->ihl << 2;
1709 mtu = guess_mtu(old_mtu);
1710 }
1711
1712 if (mtu < ip_rt_min_pmtu)
1713 mtu = ip_rt_min_pmtu;
1714 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1715 unsigned long pmtu_expires;
1716
1717 pmtu_expires = jiffies + ip_rt_mtu_expires;
1718 if (!pmtu_expires)
1719 pmtu_expires = 1UL;
1720
1721 est_mtu = mtu;
1722 peer->pmtu_learned = mtu;
1723 peer->pmtu_expires = pmtu_expires;
1724 atomic_inc(&__rt_peer_genid);
1725 }
1726
1727 inet_putpeer(peer);
1728 }
1729 return est_mtu ? : new_mtu;
1730}
1731
1732static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1733{
1734 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1735
1736 if (!expires)
1737 return;
1738 if (time_before(jiffies, expires)) {
1739 u32 orig_dst_mtu = dst_mtu(dst);
1740 if (peer->pmtu_learned < orig_dst_mtu) {
1741 if (!peer->pmtu_orig)
1742 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1743 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1744 }
1745 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1746 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1747}
1748
1749static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1750{
1751 struct rtable *rt = (struct rtable *) dst;
1752 struct inet_peer *peer;
1753
1754 dst_confirm(dst);
1755
1756 if (!rt->peer)
1757 rt_bind_peer(rt, rt->rt_dst, 1);
1758 peer = rt->peer;
1759 if (peer) {
1760 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1761
1762 if (mtu < ip_rt_min_pmtu)
1763 mtu = ip_rt_min_pmtu;
1764 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1765
1766 pmtu_expires = jiffies + ip_rt_mtu_expires;
1767 if (!pmtu_expires)
1768 pmtu_expires = 1UL;
1769
1770 peer->pmtu_learned = mtu;
1771 peer->pmtu_expires = pmtu_expires;
1772
1773 atomic_inc(&__rt_peer_genid);
1774 rt->rt_peer_genid = rt_peer_genid();
1775 }
1776 check_peer_pmtu(dst, peer);
1777 }
1778}
1779
1780
1781static void ipv4_validate_peer(struct rtable *rt)
1782{
1783 if (rt->rt_peer_genid != rt_peer_genid()) {
1784 struct inet_peer *peer;
1785
1786 if (!rt->peer)
1787 rt_bind_peer(rt, rt->rt_dst, 0);
1788
1789 peer = rt->peer;
1790 if (peer) {
1791 check_peer_pmtu(&rt->dst, peer);
1792
1793 if (peer->redirect_learned.a4 &&
1794 peer->redirect_learned.a4 != rt->rt_gateway)
1795 check_peer_redir(&rt->dst, peer);
1796 }
1797
1798 rt->rt_peer_genid = rt_peer_genid();
1799 }
1800}
1801
1802static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803{
1804 struct rtable *rt = (struct rtable *) dst;
1805
1806 if (rt_is_expired(rt))
1807 return NULL;
1808 ipv4_validate_peer(rt);
1809 return dst;
1810}
1811
1812static void ipv4_dst_destroy(struct dst_entry *dst)
1813{
1814 struct rtable *rt = (struct rtable *) dst;
1815 struct inet_peer *peer = rt->peer;
1816
1817 if (rt->fi) {
1818 fib_info_put(rt->fi);
1819 rt->fi = NULL;
1820 }
1821 if (peer) {
1822 rt->peer = NULL;
1823 inet_putpeer(peer);
1824 }
1825}
1826
1827
1828static void ipv4_link_failure(struct sk_buff *skb)
1829{
1830 struct rtable *rt;
1831
1832 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833
1834 rt = skb_rtable(skb);
1835 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1836 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1837}
1838
1839static int ip_rt_bug(struct sk_buff *skb)
1840{
1841 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1842 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1843 skb->dev ? skb->dev->name : "?");
1844 kfree_skb(skb);
1845 WARN_ON(1);
1846 return 0;
1847}
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1859{
1860 __be32 src;
1861
1862 if (rt_is_output_route(rt))
1863 src = ip_hdr(skb)->saddr;
1864 else {
1865 struct fib_result res;
1866 struct flowi4 fl4;
1867 struct iphdr *iph;
1868
1869 iph = ip_hdr(skb);
1870
1871 memset(&fl4, 0, sizeof(fl4));
1872 fl4.daddr = iph->daddr;
1873 fl4.saddr = iph->saddr;
1874 fl4.flowi4_tos = RT_TOS(iph->tos);
1875 fl4.flowi4_oif = rt->dst.dev->ifindex;
1876 fl4.flowi4_iif = skb->dev->ifindex;
1877 fl4.flowi4_mark = skb->mark;
1878
1879 rcu_read_lock();
1880 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1881 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1882 else
1883 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1884 RT_SCOPE_UNIVERSE);
1885 rcu_read_unlock();
1886 }
1887 memcpy(addr, &src, 4);
1888}
1889
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's class id, one 16-bit half at a time:
 * a half of the class id is only filled in from @tag if it is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;

	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1899
1900static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1901{
1902 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1903
1904 if (advmss == 0) {
1905 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1906 ip_rt_min_advmss);
1907 if (advmss > 65535 - 40)
1908 advmss = 65535 - 40;
1909 }
1910 return advmss;
1911}
1912
1913static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914{
1915 const struct rtable *rt = (const struct rtable *) dst;
1916 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1917
1918 if (mtu && rt_is_output_route(rt))
1919 return mtu;
1920
1921 mtu = dst->dev->mtu;
1922
1923 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1924
1925 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926 mtu = 576;
1927 }
1928
1929 if (mtu > IP_MAX_MTU)
1930 mtu = IP_MAX_MTU;
1931
1932 return mtu;
1933}
1934
1935static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1936 struct fib_info *fi)
1937{
1938 struct inet_peer *peer;
1939 int create = 0;
1940
1941
1942
1943
1944 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945 create = 1;
1946
1947 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1948 if (peer) {
1949 rt->rt_peer_genid = rt_peer_genid();
1950 if (inet_metrics_new(peer))
1951 memcpy(peer->metrics, fi->fib_metrics,
1952 sizeof(u32) * RTAX_MAX);
1953 dst_init_metrics(&rt->dst, peer->metrics, false);
1954
1955 check_peer_pmtu(&rt->dst, peer);
1956
1957 if (peer->redirect_learned.a4 &&
1958 peer->redirect_learned.a4 != rt->rt_gateway) {
1959 rt->rt_gateway = peer->redirect_learned.a4;
1960 rt->rt_flags |= RTCF_REDIRECTED;
1961 }
1962 } else {
1963 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1964 rt->fi = fi;
1965 atomic_inc(&fi->fib_clntref);
1966 }
1967 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968 }
1969}
1970
1971static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1972 const struct fib_result *res,
1973 struct fib_info *fi, u16 type, u32 itag)
1974{
1975 struct dst_entry *dst = &rt->dst;
1976
1977 if (fi) {
1978 if (FIB_RES_GW(*res) &&
1979 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1980 rt->rt_gateway = FIB_RES_GW(*res);
1981 rt_init_metrics(rt, fl4, fi);
1982#ifdef CONFIG_IP_ROUTE_CLASSID
1983 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1984#endif
1985 }
1986
1987 if (dst_mtu(dst) > IP_MAX_MTU)
1988 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1991
1992#ifdef CONFIG_IP_ROUTE_CLASSID
1993#ifdef CONFIG_IP_MULTIPLE_TABLES
1994 set_class_tag(rt, fib_rules_tclass(res));
1995#endif
1996 set_class_tag(rt, itag);
1997#endif
1998}
1999
2000static struct rtable *rt_dst_alloc(struct net_device *dev,
2001 bool nopolicy, bool noxfrm)
2002{
2003 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2004 DST_HOST |
2005 (nopolicy ? DST_NOPOLICY : 0) |
2006 (noxfrm ? DST_NOXFRM : 0));
2007}
2008
2009
2010static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2011 u8 tos, struct net_device *dev, int our)
2012{
2013 unsigned int hash;
2014 struct rtable *rth;
2015 __be32 spec_dst;
2016 struct in_device *in_dev = __in_dev_get_rcu(dev);
2017 u32 itag = 0;
2018 int err;
2019
2020
2021
2022 if (in_dev == NULL)
2023 return -EINVAL;
2024
2025 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2027 goto e_inval;
2028
2029 if (ipv4_is_zeronet(saddr)) {
2030 if (!ipv4_is_local_multicast(daddr))
2031 goto e_inval;
2032 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033 } else {
2034 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2035 &itag);
2036 if (err < 0)
2037 goto e_err;
2038 }
2039 rth = rt_dst_alloc(init_net.loopback_dev,
2040 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2041 if (!rth)
2042 goto e_nobufs;
2043
2044#ifdef CONFIG_IP_ROUTE_CLASSID
2045 rth->dst.tclassid = itag;
2046#endif
2047 rth->dst.output = ip_rt_bug;
2048
2049 rth->rt_key_dst = daddr;
2050 rth->rt_key_src = saddr;
2051 rth->rt_genid = rt_genid(dev_net(dev));
2052 rth->rt_flags = RTCF_MULTICAST;
2053 rth->rt_type = RTN_MULTICAST;
2054 rth->rt_key_tos = tos;
2055 rth->rt_dst = daddr;
2056 rth->rt_src = saddr;
2057 rth->rt_route_iif = dev->ifindex;
2058 rth->rt_iif = dev->ifindex;
2059 rth->rt_oif = 0;
2060 rth->rt_mark = skb->mark;
2061 rth->rt_gateway = daddr;
2062 rth->rt_spec_dst= spec_dst;
2063 rth->rt_peer_genid = 0;
2064 rth->peer = NULL;
2065 rth->fi = NULL;
2066 if (our) {
2067 rth->dst.input= ip_local_deliver;
2068 rth->rt_flags |= RTCF_LOCAL;
2069 }
2070
2071#ifdef CONFIG_IP_MROUTE
2072 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2073 rth->dst.input = ip_mr_input;
2074#endif
2075 RT_CACHE_STAT_INC(in_slow_mc);
2076
2077 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2078 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2079 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2080
2081e_nobufs:
2082 return -ENOBUFS;
2083e_inval:
2084 return -EINVAL;
2085e_err:
2086 return err;
2087}
2088
2089
2090static void ip_handle_martian_source(struct net_device *dev,
2091 struct in_device *in_dev,
2092 struct sk_buff *skb,
2093 __be32 daddr,
2094 __be32 saddr)
2095{
2096 RT_CACHE_STAT_INC(in_martian_src);
2097#ifdef CONFIG_IP_ROUTE_VERBOSE
2098 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2099
2100
2101
2102
2103 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2104 &daddr, &saddr, dev->name);
2105 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2106 int i;
2107 const unsigned char *p = skb_mac_header(skb);
2108 printk(KERN_WARNING "ll header: ");
2109 for (i = 0; i < dev->hard_header_len; i++, p++) {
2110 printk("%02x", *p);
2111 if (i < (dev->hard_header_len - 1))
2112 printk(":");
2113 }
2114 printk("\n");
2115 }
2116 }
2117#endif
2118}
2119
2120
2121static int __mkroute_input(struct sk_buff *skb,
2122 const struct fib_result *res,
2123 struct in_device *in_dev,
2124 __be32 daddr, __be32 saddr, u32 tos,
2125 struct rtable **result)
2126{
2127 struct rtable *rth;
2128 int err;
2129 struct in_device *out_dev;
2130 unsigned int flags = 0;
2131 __be32 spec_dst;
2132 u32 itag;
2133
2134
2135 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2136 if (out_dev == NULL) {
2137 if (net_ratelimit())
2138 printk(KERN_CRIT "Bug in ip_route_input" \
2139 "_slow(). Please, report\n");
2140 return -EINVAL;
2141 }
2142
2143
2144 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2145 in_dev->dev, &spec_dst, &itag);
2146 if (err < 0) {
2147 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2148 saddr);
2149
2150 goto cleanup;
2151 }
2152
2153 if (err)
2154 flags |= RTCF_DIRECTSRC;
2155
2156 if (out_dev == in_dev && err &&
2157 (IN_DEV_SHARED_MEDIA(out_dev) ||
2158 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2159 flags |= RTCF_DOREDIRECT;
2160
2161 if (skb->protocol != htons(ETH_P_IP)) {
2162
2163
2164
2165
2166
2167
2168
2169 if (out_dev == in_dev &&
2170 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2171 err = -EINVAL;
2172 goto cleanup;
2173 }
2174 }
2175
2176 rth = rt_dst_alloc(out_dev->dev,
2177 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2178 IN_DEV_CONF_GET(out_dev, NOXFRM));
2179 if (!rth) {
2180 err = -ENOBUFS;
2181 goto cleanup;
2182 }
2183
2184 rth->rt_key_dst = daddr;
2185 rth->rt_key_src = saddr;
2186 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2187 rth->rt_flags = flags;
2188 rth->rt_type = res->type;
2189 rth->rt_key_tos = tos;
2190 rth->rt_dst = daddr;
2191 rth->rt_src = saddr;
2192 rth->rt_route_iif = in_dev->dev->ifindex;
2193 rth->rt_iif = in_dev->dev->ifindex;
2194 rth->rt_oif = 0;
2195 rth->rt_mark = skb->mark;
2196 rth->rt_gateway = daddr;
2197 rth->rt_spec_dst= spec_dst;
2198 rth->rt_peer_genid = 0;
2199 rth->peer = NULL;
2200 rth->fi = NULL;
2201
2202 rth->dst.input = ip_forward;
2203 rth->dst.output = ip_output;
2204
2205 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2206
2207 *result = rth;
2208 err = 0;
2209 cleanup:
2210 return err;
2211}
2212
2213static int ip_mkroute_input(struct sk_buff *skb,
2214 struct fib_result *res,
2215 const struct flowi4 *fl4,
2216 struct in_device *in_dev,
2217 __be32 daddr, __be32 saddr, u32 tos)
2218{
2219 struct rtable* rth = NULL;
2220 int err;
2221 unsigned hash;
2222
2223#ifdef CONFIG_IP_ROUTE_MULTIPATH
2224 if (res->fi && res->fi->fib_nhs > 1)
2225 fib_select_multipath(res);
2226#endif
2227
2228
2229 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2230 if (err)
2231 return err;
2232
2233
2234 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2235 rt_genid(dev_net(rth->dst.dev)));
2236 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2237 if (IS_ERR(rth))
2238 return PTR_ERR(rth);
2239 return 0;
2240}
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2254 u8 tos, struct net_device *dev)
2255{
2256 struct fib_result res;
2257 struct in_device *in_dev = __in_dev_get_rcu(dev);
2258 struct flowi4 fl4;
2259 unsigned flags = 0;
2260 u32 itag = 0;
2261 struct rtable * rth;
2262 unsigned hash;
2263 __be32 spec_dst;
2264 int err = -EINVAL;
2265 struct net * net = dev_net(dev);
2266
2267
2268
2269 if (!in_dev)
2270 goto out;
2271
2272
2273
2274
2275
2276 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2277 ipv4_is_loopback(saddr))
2278 goto martian_source;
2279
2280 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2281 goto brd_input;
2282
2283
2284
2285
2286 if (ipv4_is_zeronet(saddr))
2287 goto martian_source;
2288
2289 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2290 goto martian_destination;
2291
2292
2293
2294
2295 fl4.flowi4_oif = 0;
2296 fl4.flowi4_iif = dev->ifindex;
2297 fl4.flowi4_mark = skb->mark;
2298 fl4.flowi4_tos = tos;
2299 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2300 fl4.daddr = daddr;
2301 fl4.saddr = saddr;
2302 err = fib_lookup(net, &fl4, &res);
2303 if (err != 0) {
2304 if (!IN_DEV_FORWARD(in_dev))
2305 goto e_hostunreach;
2306 goto no_route;
2307 }
2308
2309 RT_CACHE_STAT_INC(in_slow_tot);
2310
2311 if (res.type == RTN_BROADCAST)
2312 goto brd_input;
2313
2314 if (res.type == RTN_LOCAL) {
2315 err = fib_validate_source(skb, saddr, daddr, tos,
2316 net->loopback_dev->ifindex,
2317 dev, &spec_dst, &itag);
2318 if (err < 0)
2319 goto martian_source_keep_err;
2320 if (err)
2321 flags |= RTCF_DIRECTSRC;
2322 spec_dst = daddr;
2323 goto local_input;
2324 }
2325
2326 if (!IN_DEV_FORWARD(in_dev))
2327 goto e_hostunreach;
2328 if (res.type != RTN_UNICAST)
2329 goto martian_destination;
2330
2331 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2332out: return err;
2333
2334brd_input:
2335 if (skb->protocol != htons(ETH_P_IP))
2336 goto e_inval;
2337
2338 if (ipv4_is_zeronet(saddr))
2339 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2340 else {
2341 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2342 &itag);
2343 if (err < 0)
2344 goto martian_source_keep_err;
2345 if (err)
2346 flags |= RTCF_DIRECTSRC;
2347 }
2348 flags |= RTCF_BROADCAST;
2349 res.type = RTN_BROADCAST;
2350 RT_CACHE_STAT_INC(in_brd);
2351
2352local_input:
2353 rth = rt_dst_alloc(net->loopback_dev,
2354 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2355 if (!rth)
2356 goto e_nobufs;
2357
2358 rth->dst.input= ip_local_deliver;
2359 rth->dst.output= ip_rt_bug;
2360#ifdef CONFIG_IP_ROUTE_CLASSID
2361 rth->dst.tclassid = itag;
2362#endif
2363
2364 rth->rt_key_dst = daddr;
2365 rth->rt_key_src = saddr;
2366 rth->rt_genid = rt_genid(net);
2367 rth->rt_flags = flags|RTCF_LOCAL;
2368 rth->rt_type = res.type;
2369 rth->rt_key_tos = tos;
2370 rth->rt_dst = daddr;
2371 rth->rt_src = saddr;
2372#ifdef CONFIG_IP_ROUTE_CLASSID
2373 rth->dst.tclassid = itag;
2374#endif
2375 rth->rt_route_iif = dev->ifindex;
2376 rth->rt_iif = dev->ifindex;
2377 rth->rt_oif = 0;
2378 rth->rt_mark = skb->mark;
2379 rth->rt_gateway = daddr;
2380 rth->rt_spec_dst= spec_dst;
2381 rth->rt_peer_genid = 0;
2382 rth->peer = NULL;
2383 rth->fi = NULL;
2384 if (res.type == RTN_UNREACHABLE) {
2385 rth->dst.input= ip_error;
2386 rth->dst.error= -err;
2387 rth->rt_flags &= ~RTCF_LOCAL;
2388 }
2389 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2390 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2391 err = 0;
2392 if (IS_ERR(rth))
2393 err = PTR_ERR(rth);
2394 goto out;
2395
2396no_route:
2397 RT_CACHE_STAT_INC(in_no_route);
2398 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2399 res.type = RTN_UNREACHABLE;
2400 if (err == -ESRCH)
2401 err = -ENETUNREACH;
2402 goto local_input;
2403
2404
2405
2406
2407martian_destination:
2408 RT_CACHE_STAT_INC(in_martian_dst);
2409#ifdef CONFIG_IP_ROUTE_VERBOSE
2410 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2411 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2412 &daddr, &saddr, dev->name);
2413#endif
2414
2415e_hostunreach:
2416 err = -EHOSTUNREACH;
2417 goto out;
2418
2419e_inval:
2420 err = -EINVAL;
2421 goto out;
2422
2423e_nobufs:
2424 err = -ENOBUFS;
2425 goto out;
2426
2427martian_source:
2428 err = -EINVAL;
2429martian_source_keep_err:
2430 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2431 goto out;
2432}
2433
2434int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2435 u8 tos, struct net_device *dev, bool noref)
2436{
2437 struct rtable * rth;
2438 unsigned hash;
2439 int iif = dev->ifindex;
2440 struct net *net;
2441 int res;
2442
2443 net = dev_net(dev);
2444
2445 rcu_read_lock();
2446
2447 if (!rt_caching(net))
2448 goto skip_cache;
2449
2450 tos &= IPTOS_RT_MASK;
2451 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2452
2453 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2454 rth = rcu_dereference(rth->dst.rt_next)) {
2455 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2456 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2457 (rth->rt_route_iif ^ iif) |
2458 (rth->rt_key_tos ^ tos)) == 0 &&
2459 rth->rt_mark == skb->mark &&
2460 net_eq(dev_net(rth->dst.dev), net) &&
2461 !rt_is_expired(rth)) {
2462 ipv4_validate_peer(rth);
2463 if (noref) {
2464 dst_use_noref(&rth->dst, jiffies);
2465 skb_dst_set_noref(skb, &rth->dst);
2466 } else {
2467 dst_use(&rth->dst, jiffies);
2468 skb_dst_set(skb, &rth->dst);
2469 }
2470 RT_CACHE_STAT_INC(in_hit);
2471 rcu_read_unlock();
2472 return 0;
2473 }
2474 RT_CACHE_STAT_INC(in_hlist_search);
2475 }
2476
2477skip_cache:
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489 if (ipv4_is_multicast(daddr)) {
2490 struct in_device *in_dev = __in_dev_get_rcu(dev);
2491
2492 if (in_dev) {
2493 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2494 ip_hdr(skb)->protocol);
2495 if (our
2496#ifdef CONFIG_IP_MROUTE
2497 ||
2498 (!ipv4_is_local_multicast(daddr) &&
2499 IN_DEV_MFORWARD(in_dev))
2500#endif
2501 ) {
2502 int res = ip_route_input_mc(skb, daddr, saddr,
2503 tos, dev, our);
2504 rcu_read_unlock();
2505 return res;
2506 }
2507 }
2508 rcu_read_unlock();
2509 return -EINVAL;
2510 }
2511 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2512 rcu_read_unlock();
2513 return res;
2514}
2515EXPORT_SYMBOL(ip_route_input_common);
2516
2517
2518static struct rtable *__mkroute_output(const struct fib_result *res,
2519 const struct flowi4 *fl4,
2520 __be32 orig_daddr, __be32 orig_saddr,
2521 int orig_oif, __u8 orig_rtos,
2522 struct net_device *dev_out,
2523 unsigned int flags)
2524{
2525 struct fib_info *fi = res->fi;
2526 struct in_device *in_dev;
2527 u16 type = res->type;
2528 struct rtable *rth;
2529
2530 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2531 return ERR_PTR(-EINVAL);
2532
2533 if (ipv4_is_lbcast(fl4->daddr))
2534 type = RTN_BROADCAST;
2535 else if (ipv4_is_multicast(fl4->daddr))
2536 type = RTN_MULTICAST;
2537 else if (ipv4_is_zeronet(fl4->daddr))
2538 return ERR_PTR(-EINVAL);
2539
2540 if (dev_out->flags & IFF_LOOPBACK)
2541 flags |= RTCF_LOCAL;
2542
2543 in_dev = __in_dev_get_rcu(dev_out);
2544 if (!in_dev)
2545 return ERR_PTR(-EINVAL);
2546
2547 if (type == RTN_BROADCAST) {
2548 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2549 fi = NULL;
2550 } else if (type == RTN_MULTICAST) {
2551 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2552 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2553 fl4->flowi4_proto))
2554 flags &= ~RTCF_LOCAL;
2555
2556
2557
2558
2559 if (fi && res->prefixlen < 4)
2560 fi = NULL;
2561 }
2562
2563 rth = rt_dst_alloc(dev_out,
2564 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2565 IN_DEV_CONF_GET(in_dev, NOXFRM));
2566 if (!rth)
2567 return ERR_PTR(-ENOBUFS);
2568
2569 rth->dst.output = ip_output;
2570
2571 rth->rt_key_dst = orig_daddr;
2572 rth->rt_key_src = orig_saddr;
2573 rth->rt_genid = rt_genid(dev_net(dev_out));
2574 rth->rt_flags = flags;
2575 rth->rt_type = type;
2576 rth->rt_key_tos = orig_rtos;
2577 rth->rt_dst = fl4->daddr;
2578 rth->rt_src = fl4->saddr;
2579 rth->rt_route_iif = 0;
2580 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2581 rth->rt_oif = orig_oif;
2582 rth->rt_mark = fl4->flowi4_mark;
2583 rth->rt_gateway = fl4->daddr;
2584 rth->rt_spec_dst= fl4->saddr;
2585 rth->rt_peer_genid = 0;
2586 rth->peer = NULL;
2587 rth->fi = NULL;
2588
2589 RT_CACHE_STAT_INC(out_slow_tot);
2590
2591 if (flags & RTCF_LOCAL) {
2592 rth->dst.input = ip_local_deliver;
2593 rth->rt_spec_dst = fl4->daddr;
2594 }
2595 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2596 rth->rt_spec_dst = fl4->saddr;
2597 if (flags & RTCF_LOCAL &&
2598 !(dev_out->flags & IFF_LOOPBACK)) {
2599 rth->dst.output = ip_mc_output;
2600 RT_CACHE_STAT_INC(out_slow_mc);
2601 }
2602#ifdef CONFIG_IP_MROUTE
2603 if (type == RTN_MULTICAST) {
2604 if (IN_DEV_MFORWARD(in_dev) &&
2605 !ipv4_is_local_multicast(fl4->daddr)) {
2606 rth->dst.input = ip_mr_input;
2607 rth->dst.output = ip_mc_output;
2608 }
2609 }
2610#endif
2611 }
2612
2613 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2614
2615 return rth;
2616}
2617
2618
2619
2620
2621
2622
2623static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2624{
2625 struct net_device *dev_out = NULL;
2626 __u8 tos = RT_FL_TOS(fl4);
2627 unsigned int flags = 0;
2628 struct fib_result res;
2629 struct rtable *rth;
2630 __be32 orig_daddr;
2631 __be32 orig_saddr;
2632 int orig_oif;
2633
2634 res.fi = NULL;
2635#ifdef CONFIG_IP_MULTIPLE_TABLES
2636 res.r = NULL;
2637#endif
2638
2639 orig_daddr = fl4->daddr;
2640 orig_saddr = fl4->saddr;
2641 orig_oif = fl4->flowi4_oif;
2642
2643 fl4->flowi4_iif = net->loopback_dev->ifindex;
2644 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2645 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2646 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2647
2648 rcu_read_lock();
2649 if (fl4->saddr) {
2650 rth = ERR_PTR(-EINVAL);
2651 if (ipv4_is_multicast(fl4->saddr) ||
2652 ipv4_is_lbcast(fl4->saddr) ||
2653 ipv4_is_zeronet(fl4->saddr))
2654 goto out;
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664 if (fl4->flowi4_oif == 0 &&
2665 (ipv4_is_multicast(fl4->daddr) ||
2666 ipv4_is_lbcast(fl4->daddr))) {
2667
2668 dev_out = __ip_dev_find(net, fl4->saddr, false);
2669 if (dev_out == NULL)
2670 goto out;
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687 fl4->flowi4_oif = dev_out->ifindex;
2688 goto make_route;
2689 }
2690
2691 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2692
2693 if (!__ip_dev_find(net, fl4->saddr, false))
2694 goto out;
2695 }
2696 }
2697
2698
2699 if (fl4->flowi4_oif) {
2700 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2701 rth = ERR_PTR(-ENODEV);
2702 if (dev_out == NULL)
2703 goto out;
2704
2705
2706 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2707 rth = ERR_PTR(-ENETUNREACH);
2708 goto out;
2709 }
2710 if (ipv4_is_local_multicast(fl4->daddr) ||
2711 ipv4_is_lbcast(fl4->daddr)) {
2712 if (!fl4->saddr)
2713 fl4->saddr = inet_select_addr(dev_out, 0,
2714 RT_SCOPE_LINK);
2715 goto make_route;
2716 }
2717 if (fl4->saddr) {
2718 if (ipv4_is_multicast(fl4->daddr))
2719 fl4->saddr = inet_select_addr(dev_out, 0,
2720 fl4->flowi4_scope);
2721 else if (!fl4->daddr)
2722 fl4->saddr = inet_select_addr(dev_out, 0,
2723 RT_SCOPE_HOST);
2724 }
2725 }
2726
2727 if (!fl4->daddr) {
2728 fl4->daddr = fl4->saddr;
2729 if (!fl4->daddr)
2730 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2731 dev_out = net->loopback_dev;
2732 fl4->flowi4_oif = net->loopback_dev->ifindex;
2733 res.type = RTN_LOCAL;
2734 flags |= RTCF_LOCAL;
2735 goto make_route;
2736 }
2737
2738 if (fib_lookup(net, fl4, &res)) {
2739 res.fi = NULL;
2740 if (fl4->flowi4_oif) {
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759 if (fl4->saddr == 0)
2760 fl4->saddr = inet_select_addr(dev_out, 0,
2761 RT_SCOPE_LINK);
2762 res.type = RTN_UNICAST;
2763 goto make_route;
2764 }
2765 rth = ERR_PTR(-ENETUNREACH);
2766 goto out;
2767 }
2768
2769 if (res.type == RTN_LOCAL) {
2770 if (!fl4->saddr) {
2771 if (res.fi->fib_prefsrc)
2772 fl4->saddr = res.fi->fib_prefsrc;
2773 else
2774 fl4->saddr = fl4->daddr;
2775 }
2776 dev_out = net->loopback_dev;
2777 fl4->flowi4_oif = dev_out->ifindex;
2778 res.fi = NULL;
2779 flags |= RTCF_LOCAL;
2780 goto make_route;
2781 }
2782
2783#ifdef CONFIG_IP_ROUTE_MULTIPATH
2784 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2785 fib_select_multipath(&res);
2786 else
2787#endif
2788 if (!res.prefixlen &&
2789 res.table->tb_num_default > 1 &&
2790 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2791 fib_select_default(&res);
2792
2793 if (!fl4->saddr)
2794 fl4->saddr = FIB_RES_PREFSRC(net, res);
2795
2796 dev_out = FIB_RES_DEV(res);
2797 fl4->flowi4_oif = dev_out->ifindex;
2798
2799
2800make_route:
2801 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2802 tos, dev_out, flags);
2803 if (!IS_ERR(rth)) {
2804 unsigned int hash;
2805
2806 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2807 rt_genid(dev_net(dev_out)));
2808 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2809 }
2810
2811out:
2812 rcu_read_unlock();
2813 return rth;
2814}
2815
2816struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2817{
2818 struct rtable *rth;
2819 unsigned int hash;
2820
2821 if (!rt_caching(net))
2822 goto slow_output;
2823
2824 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2825
2826 rcu_read_lock_bh();
2827 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2828 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2829 if (rth->rt_key_dst == flp4->daddr &&
2830 rth->rt_key_src == flp4->saddr &&
2831 rt_is_output_route(rth) &&
2832 rth->rt_oif == flp4->flowi4_oif &&
2833 rth->rt_mark == flp4->flowi4_mark &&
2834 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2835 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2836 net_eq(dev_net(rth->dst.dev), net) &&
2837 !rt_is_expired(rth)) {
2838 ipv4_validate_peer(rth);
2839 dst_use(&rth->dst, jiffies);
2840 RT_CACHE_STAT_INC(out_hit);
2841 rcu_read_unlock_bh();
2842 if (!flp4->saddr)
2843 flp4->saddr = rth->rt_src;
2844 if (!flp4->daddr)
2845 flp4->daddr = rth->rt_dst;
2846 return rth;
2847 }
2848 RT_CACHE_STAT_INC(out_hlist_search);
2849 }
2850 rcu_read_unlock_bh();
2851
2852slow_output:
2853 return ip_route_output_slow(net, flp4);
2854}
2855EXPORT_SYMBOL_GPL(__ip_route_output_key);
2856
2857static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2858{
2859 return NULL;
2860}
2861
2862static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2863{
2864 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2865
2866 return mtu ? : dst->dev->mtu;
2867}
2868
2869static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2870{
2871}
2872
2873static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2874 unsigned long old)
2875{
2876 return NULL;
2877}
2878
2879static struct dst_ops ipv4_dst_blackhole_ops = {
2880 .family = AF_INET,
2881 .protocol = cpu_to_be16(ETH_P_IP),
2882 .destroy = ipv4_dst_destroy,
2883 .check = ipv4_blackhole_dst_check,
2884 .mtu = ipv4_blackhole_mtu,
2885 .default_advmss = ipv4_default_advmss,
2886 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2887 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2888 .neigh_lookup = ipv4_neigh_lookup,
2889};
2890
2891struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2892{
2893 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2894 struct rtable *ort = (struct rtable *) dst_orig;
2895
2896 if (rt) {
2897 struct dst_entry *new = &rt->dst;
2898
2899 new->__use = 1;
2900 new->input = dst_discard;
2901 new->output = dst_discard;
2902 dst_copy_metrics(new, &ort->dst);
2903
2904 new->dev = ort->dst.dev;
2905 if (new->dev)
2906 dev_hold(new->dev);
2907
2908 rt->rt_key_dst = ort->rt_key_dst;
2909 rt->rt_key_src = ort->rt_key_src;
2910 rt->rt_key_tos = ort->rt_key_tos;
2911 rt->rt_route_iif = ort->rt_route_iif;
2912 rt->rt_iif = ort->rt_iif;
2913 rt->rt_oif = ort->rt_oif;
2914 rt->rt_mark = ort->rt_mark;
2915
2916 rt->rt_genid = rt_genid(net);
2917 rt->rt_flags = ort->rt_flags;
2918 rt->rt_type = ort->rt_type;
2919 rt->rt_dst = ort->rt_dst;
2920 rt->rt_src = ort->rt_src;
2921 rt->rt_gateway = ort->rt_gateway;
2922 rt->rt_spec_dst = ort->rt_spec_dst;
2923 rt->peer = ort->peer;
2924 if (rt->peer)
2925 atomic_inc(&rt->peer->refcnt);
2926 rt->fi = ort->fi;
2927 if (rt->fi)
2928 atomic_inc(&rt->fi->fib_clntref);
2929
2930 dst_free(new);
2931 }
2932
2933 dst_release(dst_orig);
2934
2935 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2936}
2937
2938struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2939 struct sock *sk)
2940{
2941 struct rtable *rt = __ip_route_output_key(net, flp4);
2942
2943 if (IS_ERR(rt))
2944 return rt;
2945
2946 if (flp4->flowi4_proto)
2947 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2948 flowi4_to_flowi(flp4),
2949 sk, 0);
2950
2951 return rt;
2952}
2953EXPORT_SYMBOL_GPL(ip_route_output_flow);
2954
2955static int rt_fill_info(struct net *net,
2956 struct sk_buff *skb, u32 pid, u32 seq, int event,
2957 int nowait, unsigned int flags)
2958{
2959 struct rtable *rt = skb_rtable(skb);
2960 struct rtmsg *r;
2961 struct nlmsghdr *nlh;
2962 unsigned long expires = 0;
2963 const struct inet_peer *peer = rt->peer;
2964 u32 id = 0, ts = 0, tsage = 0, error;
2965
2966 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2967 if (nlh == NULL)
2968 return -EMSGSIZE;
2969
2970 r = nlmsg_data(nlh);
2971 r->rtm_family = AF_INET;
2972 r->rtm_dst_len = 32;
2973 r->rtm_src_len = 0;
2974 r->rtm_tos = rt->rt_key_tos;
2975 r->rtm_table = RT_TABLE_MAIN;
2976 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2977 r->rtm_type = rt->rt_type;
2978 r->rtm_scope = RT_SCOPE_UNIVERSE;
2979 r->rtm_protocol = RTPROT_UNSPEC;
2980 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2981 if (rt->rt_flags & RTCF_NOTIFY)
2982 r->rtm_flags |= RTM_F_NOTIFY;
2983
2984 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2985
2986 if (rt->rt_key_src) {
2987 r->rtm_src_len = 32;
2988 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2989 }
2990 if (rt->dst.dev)
2991 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2992#ifdef CONFIG_IP_ROUTE_CLASSID
2993 if (rt->dst.tclassid)
2994 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2995#endif
2996 if (rt_is_input_route(rt))
2997 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2998 else if (rt->rt_src != rt->rt_key_src)
2999 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3000
3001 if (rt->rt_dst != rt->rt_gateway)
3002 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3003
3004 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3005 goto nla_put_failure;
3006
3007 if (rt->rt_mark)
3008 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3009
3010 error = rt->dst.error;
3011 if (peer) {
3012 inet_peer_refcheck(rt->peer);
3013 id = atomic_read(&peer->ip_id_count) & 0xffff;
3014 if (peer->tcp_ts_stamp) {
3015 ts = peer->tcp_ts;
3016 tsage = get_seconds() - peer->tcp_ts_stamp;
3017 }
3018 expires = ACCESS_ONCE(peer->pmtu_expires);
3019 if (expires) {
3020 if (time_before(jiffies, expires))
3021 expires -= jiffies;
3022 else
3023 expires = 0;
3024 }
3025 }
3026
3027 if (rt_is_input_route(rt)) {
3028#ifdef CONFIG_IP_MROUTE
3029 __be32 dst = rt->rt_dst;
3030
3031 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3032 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3033 int err = ipmr_get_route(net, skb,
3034 rt->rt_src, rt->rt_dst,
3035 r, nowait);
3036 if (err <= 0) {
3037 if (!nowait) {
3038 if (err == 0)
3039 return 0;
3040 goto nla_put_failure;
3041 } else {
3042 if (err == -EMSGSIZE)
3043 goto nla_put_failure;
3044 error = err;
3045 }
3046 }
3047 } else
3048#endif
3049 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3050 }
3051
3052 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3053 expires, error) < 0)
3054 goto nla_put_failure;
3055
3056 return nlmsg_end(skb, nlh);
3057
3058nla_put_failure:
3059 nlmsg_cancel(skb, nlh);
3060 return -EMSGSIZE;
3061}
3062
3063static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3064{
3065 struct net *net = sock_net(in_skb->sk);
3066 struct rtmsg *rtm;
3067 struct nlattr *tb[RTA_MAX+1];
3068 struct rtable *rt = NULL;
3069 __be32 dst = 0;
3070 __be32 src = 0;
3071 u32 iif;
3072 int err;
3073 int mark;
3074 struct sk_buff *skb;
3075
3076 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3077 if (err < 0)
3078 goto errout;
3079
3080 rtm = nlmsg_data(nlh);
3081
3082 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3083 if (skb == NULL) {
3084 err = -ENOBUFS;
3085 goto errout;
3086 }
3087
3088
3089
3090
3091 skb_reset_mac_header(skb);
3092 skb_reset_network_header(skb);
3093
3094
3095 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3096 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3097
3098 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3099 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3100 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3101 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3102
3103 if (iif) {
3104 struct net_device *dev;
3105
3106 dev = __dev_get_by_index(net, iif);
3107 if (dev == NULL) {
3108 err = -ENODEV;
3109 goto errout_free;
3110 }
3111
3112 skb->protocol = htons(ETH_P_IP);
3113 skb->dev = dev;
3114 skb->mark = mark;
3115 local_bh_disable();
3116 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3117 local_bh_enable();
3118
3119 rt = skb_rtable(skb);
3120 if (err == 0 && rt->dst.error)
3121 err = -rt->dst.error;
3122 } else {
3123 struct flowi4 fl4 = {
3124 .daddr = dst,
3125 .saddr = src,
3126 .flowi4_tos = rtm->rtm_tos,
3127 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3128 .flowi4_mark = mark,
3129 };
3130 rt = ip_route_output_key(net, &fl4);
3131
3132 err = 0;
3133 if (IS_ERR(rt))
3134 err = PTR_ERR(rt);
3135 }
3136
3137 if (err)
3138 goto errout_free;
3139
3140 skb_dst_set(skb, &rt->dst);
3141 if (rtm->rtm_flags & RTM_F_NOTIFY)
3142 rt->rt_flags |= RTCF_NOTIFY;
3143
3144 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3145 RTM_NEWROUTE, 0, 0);
3146 if (err <= 0)
3147 goto errout_free;
3148
3149 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3150errout:
3151 return err;
3152
3153errout_free:
3154 kfree_skb(skb);
3155 goto errout;
3156}
3157
3158int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3159{
3160 struct rtable *rt;
3161 int h, s_h;
3162 int idx, s_idx;
3163 struct net *net;
3164
3165 net = sock_net(skb->sk);
3166
3167 s_h = cb->args[0];
3168 if (s_h < 0)
3169 s_h = 0;
3170 s_idx = idx = cb->args[1];
3171 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3172 if (!rt_hash_table[h].chain)
3173 continue;
3174 rcu_read_lock_bh();
3175 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3176 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3177 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3178 continue;
3179 if (rt_is_expired(rt))
3180 continue;
3181 skb_dst_set_noref(skb, &rt->dst);
3182 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3183 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3184 1, NLM_F_MULTI) <= 0) {
3185 skb_dst_drop(skb);
3186 rcu_read_unlock_bh();
3187 goto done;
3188 }
3189 skb_dst_drop(skb);
3190 }
3191 rcu_read_unlock_bh();
3192 }
3193
3194done:
3195 cb->args[0] = h;
3196 cb->args[1] = idx;
3197 return skb->len;
3198}
3199
3200void ip_rt_multicast_event(struct in_device *in_dev)
3201{
3202 rt_cache_flush(dev_net(in_dev->dev), 0);
3203}
3204
3205#ifdef CONFIG_SYSCTL
3206static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3207 void __user *buffer,
3208 size_t *lenp, loff_t *ppos)
3209{
3210 if (write) {
3211 int flush_delay;
3212 ctl_table ctl;
3213 struct net *net;
3214
3215 memcpy(&ctl, __ctl, sizeof(ctl));
3216 ctl.data = &flush_delay;
3217 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3218
3219 net = (struct net *)__ctl->extra1;
3220 rt_cache_flush(net, flush_delay);
3221 return 0;
3222 }
3223
3224 return -EINVAL;
3225}
3226
3227static ctl_table ipv4_route_table[] = {
3228 {
3229 .procname = "gc_thresh",
3230 .data = &ipv4_dst_ops.gc_thresh,
3231 .maxlen = sizeof(int),
3232 .mode = 0644,
3233 .proc_handler = proc_dointvec,
3234 },
3235 {
3236 .procname = "max_size",
3237 .data = &ip_rt_max_size,
3238 .maxlen = sizeof(int),
3239 .mode = 0644,
3240 .proc_handler = proc_dointvec,
3241 },
3242 {
3243
3244
3245 .procname = "gc_min_interval",
3246 .data = &ip_rt_gc_min_interval,
3247 .maxlen = sizeof(int),
3248 .mode = 0644,
3249 .proc_handler = proc_dointvec_jiffies,
3250 },
3251 {
3252 .procname = "gc_min_interval_ms",
3253 .data = &ip_rt_gc_min_interval,
3254 .maxlen = sizeof(int),
3255 .mode = 0644,
3256 .proc_handler = proc_dointvec_ms_jiffies,
3257 },
3258 {
3259 .procname = "gc_timeout",
3260 .data = &ip_rt_gc_timeout,
3261 .maxlen = sizeof(int),
3262 .mode = 0644,
3263 .proc_handler = proc_dointvec_jiffies,
3264 },
3265 {
3266 .procname = "gc_interval",
3267 .data = &ip_rt_gc_interval,
3268 .maxlen = sizeof(int),
3269 .mode = 0644,
3270 .proc_handler = proc_dointvec_jiffies,
3271 },
3272 {
3273 .procname = "redirect_load",
3274 .data = &ip_rt_redirect_load,
3275 .maxlen = sizeof(int),
3276 .mode = 0644,
3277 .proc_handler = proc_dointvec,
3278 },
3279 {
3280 .procname = "redirect_number",
3281 .data = &ip_rt_redirect_number,
3282 .maxlen = sizeof(int),
3283 .mode = 0644,
3284 .proc_handler = proc_dointvec,
3285 },
3286 {
3287 .procname = "redirect_silence",
3288 .data = &ip_rt_redirect_silence,
3289 .maxlen = sizeof(int),
3290 .mode = 0644,
3291 .proc_handler = proc_dointvec,
3292 },
3293 {
3294 .procname = "error_cost",
3295 .data = &ip_rt_error_cost,
3296 .maxlen = sizeof(int),
3297 .mode = 0644,
3298 .proc_handler = proc_dointvec,
3299 },
3300 {
3301 .procname = "error_burst",
3302 .data = &ip_rt_error_burst,
3303 .maxlen = sizeof(int),
3304 .mode = 0644,
3305 .proc_handler = proc_dointvec,
3306 },
3307 {
3308 .procname = "gc_elasticity",
3309 .data = &ip_rt_gc_elasticity,
3310 .maxlen = sizeof(int),
3311 .mode = 0644,
3312 .proc_handler = proc_dointvec,
3313 },
3314 {
3315 .procname = "mtu_expires",
3316 .data = &ip_rt_mtu_expires,
3317 .maxlen = sizeof(int),
3318 .mode = 0644,
3319 .proc_handler = proc_dointvec_jiffies,
3320 },
3321 {
3322 .procname = "min_pmtu",
3323 .data = &ip_rt_min_pmtu,
3324 .maxlen = sizeof(int),
3325 .mode = 0644,
3326 .proc_handler = proc_dointvec,
3327 },
3328 {
3329 .procname = "min_adv_mss",
3330 .data = &ip_rt_min_advmss,
3331 .maxlen = sizeof(int),
3332 .mode = 0644,
3333 .proc_handler = proc_dointvec,
3334 },
3335 { }
3336};
3337
3338static struct ctl_table empty[1];
3339
3340static struct ctl_table ipv4_skeleton[] =
3341{
3342 { .procname = "route",
3343 .mode = 0555, .child = ipv4_route_table},
3344 { .procname = "neigh",
3345 .mode = 0555, .child = empty},
3346 { }
3347};
3348
3349static __net_initdata struct ctl_path ipv4_path[] = {
3350 { .procname = "net", },
3351 { .procname = "ipv4", },
3352 { },
3353};
3354
3355static struct ctl_table ipv4_route_flush_table[] = {
3356 {
3357 .procname = "flush",
3358 .maxlen = sizeof(int),
3359 .mode = 0200,
3360 .proc_handler = ipv4_sysctl_rtcache_flush,
3361 },
3362 { },
3363};
3364
3365static __net_initdata struct ctl_path ipv4_route_path[] = {
3366 { .procname = "net", },
3367 { .procname = "ipv4", },
3368 { .procname = "route", },
3369 { },
3370};
3371
3372static __net_init int sysctl_route_net_init(struct net *net)
3373{
3374 struct ctl_table *tbl;
3375
3376 tbl = ipv4_route_flush_table;
3377 if (!net_eq(net, &init_net)) {
3378 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3379 if (tbl == NULL)
3380 goto err_dup;
3381 }
3382 tbl[0].extra1 = net;
3383
3384 net->ipv4.route_hdr =
3385 register_net_sysctl_table(net, ipv4_route_path, tbl);
3386 if (net->ipv4.route_hdr == NULL)
3387 goto err_reg;
3388 return 0;
3389
3390err_reg:
3391 if (tbl != ipv4_route_flush_table)
3392 kfree(tbl);
3393err_dup:
3394 return -ENOMEM;
3395}
3396
3397static __net_exit void sysctl_route_net_exit(struct net *net)
3398{
3399 struct ctl_table *tbl;
3400
3401 tbl = net->ipv4.route_hdr->ctl_table_arg;
3402 unregister_net_sysctl_table(net->ipv4.route_hdr);
3403 BUG_ON(tbl == ipv4_route_flush_table);
3404 kfree(tbl);
3405}
3406
3407static __net_initdata struct pernet_operations sysctl_route_ops = {
3408 .init = sysctl_route_net_init,
3409 .exit = sysctl_route_net_exit,
3410};
3411#endif
3412
3413static __net_init int rt_genid_init(struct net *net)
3414{
3415 get_random_bytes(&net->ipv4.rt_genid,
3416 sizeof(net->ipv4.rt_genid));
3417 get_random_bytes(&net->ipv4.dev_addr_genid,
3418 sizeof(net->ipv4.dev_addr_genid));
3419 return 0;
3420}
3421
3422static __net_initdata struct pernet_operations rt_genid_ops = {
3423 .init = rt_genid_init,
3424};
3425
3426
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting array; ip_rt_init() allocates 256 entries per cpu. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif
3430
3431static __initdata unsigned long rhash_entries;
3432static int __init set_rhash_entries(char *str)
3433{
3434 if (!str)
3435 return 0;
3436 rhash_entries = simple_strtoul(str, &str, 0);
3437 return 1;
3438}
3439__setup("rhash_entries=", set_rhash_entries);
3440
3441int __init ip_rt_init(void)
3442{
3443 int rc = 0;
3444
3445#ifdef CONFIG_IP_ROUTE_CLASSID
3446 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3447 if (!ip_rt_acct)
3448 panic("IP: failed to allocate ip_rt_acct\n");
3449#endif
3450
3451 ipv4_dst_ops.kmem_cachep =
3452 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3453 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3454
3455 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3456
3457 if (dst_entries_init(&ipv4_dst_ops) < 0)
3458 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3459
3460 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3461 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3462
3463 rt_hash_table = (struct rt_hash_bucket *)
3464 alloc_large_system_hash("IP route cache",
3465 sizeof(struct rt_hash_bucket),
3466 rhash_entries,
3467 (totalram_pages >= 128 * 1024) ?
3468 15 : 17,
3469 0,
3470 &rt_hash_log,
3471 &rt_hash_mask,
3472 rhash_entries ? 0 : 512 * 1024);
3473 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3474 rt_hash_lock_init();
3475
3476 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3477 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3478
3479 devinet_init();
3480 ip_fib_init();
3481
3482 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3483 expires_ljiffies = jiffies;
3484 schedule_delayed_work(&expires_work,
3485 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3486
3487 if (ip_rt_proc_init())
3488 printk(KERN_ERR "Unable to create route proc files\n");
3489#ifdef CONFIG_XFRM
3490 xfrm_init();
3491 xfrm4_init(ip_rt_max_size);
3492#endif
3493 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3494
3495#ifdef CONFIG_SYSCTL
3496 register_pernet_subsys(&sysctl_route_ops);
3497#endif
3498 register_pernet_subsys(&rt_genid_ops);
3499 return rc;
3500}
3501
3502#ifdef CONFIG_SYSCTL
3503
3504
3505
3506
3507void __init ip_static_sysctl_init(void)
3508{
3509 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3510}
3511#endif
3512