1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
71#include <linux/mm.h>
72#include <linux/bootmem.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/workqueue.h>
83#include <linux/skbuff.h>
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
93#include <net/dst.h>
94#include <net/net_namespace.h>
95#include <net/protocol.h>
96#include <net/ip.h>
97#include <net/route.h>
98#include <net/inetpeer.h>
99#include <net/sock.h>
100#include <net/ip_fib.h>
101#include <net/arp.h>
102#include <net/tcp.h>
103#include <net/icmp.h>
104#include <net/xfrm.h>
105#include <net/netevent.h>
106#include <net/rtnetlink.h>
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
/*
 * Mask a flow key's TOS field down to the bits in IPTOS_RT_MASK plus the
 * RTO_ONLINK flag.
 */
#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

/* 0xFFF0 == 65520; by its name, the largest MTU value this file deals with. */
#define IP_MAX_MTU	0xFFF0

/* Default cache-entry timeout: 300 seconds expressed in jiffies. */
#define RT_GC_TIMEOUT (300*HZ)

/* Entry count compared against ipv4_dst_ops.entries in rt_garbage_collect(). */
static int ip_rt_max_size;
/* Timeout used by rt_check_expire() and as the ceiling for GC's "expire". */
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
/* Delay with which rt_worker_func() re-arms expires_work. */
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
/* rt_garbage_collect() may skip a run if called again within this interval. */
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
/* The following tunables are not referenced in the part of the file shown here. */
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
/*
 * Chain-length threshold: rt_intern_hash() evicts a candidate when a chain
 * grows past it, and rt_garbage_collect() scales it by the table size to
 * compute its goal.
 */
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
/* Period with which rt_secret_rebuild() re-arms the per-net secret timer. */
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
/*
 * Chain length above which rt_intern_hash() triggers an emergency rebuild;
 * recomputed by rt_check_expire() from observed chain statistics.
 */
static int rt_chain_length_max __read_mostly	= 20;

/* Periodic expiry work item (handler: rt_worker_func). */
static struct delayed_work expires_work;
/* jiffies value recorded by the previous rt_check_expire() run. */
static unsigned long expires_ljiffies;
136
137
138
139
140
/*
 * Forward declarations: the dst_ops callbacks referenced by ipv4_dst_ops
 * below, plus rt_emergency_hash_rebuild() which is called before its
 * definition.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static void rt_emergency_hash_rebuild(struct net *net);
150
151
/*
 * dst_ops descriptor for IPv4 route cache entries.  Its .entries counter is
 * read throughout this file (garbage collector, /proc statistics) as the
 * current number of cached routes.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entries =		ATOMIC_INIT(0),	/* starts with no cached routes */
};
165
/* ECN_OR_COST(X) simply expands to TC_PRIO_X. */
#define ECN_OR_COST(class)	TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* values.  Even slots name the priority
 * directly; odd slots go through ECN_OR_COST().
 * NOTE(review): presumably indexed by a value derived from the IP TOS
 * field -- confirm against the users of this table.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/*
 * One slot of the route cache hash table: the head of a singly linked
 * chain of rtable entries linked through u.dst.rt_next.
 */
struct rt_hash_bucket {
	struct rtable	*chain;
};
205
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Hash buckets do not each own a spinlock; instead a separately allocated
 * array of RT_HASH_LOCK_SZ locks is shared, and a bucket picks its lock by
 * masking its index.  RT_HASH_LOCK_SZ is a power of two in every branch
 * below, which is what makes the "& (RT_HASH_LOCK_SZ - 1)" mask valid.
 * The size grows with NR_CPUS and is pinned to 256 under CONFIG_LOCKDEP.
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/*
 * Address of the lock guarding bucket @slot.  The whole expansion is
 * parenthesized so the macro is safe to use inside a larger expression.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/* Allocate and initialise the shared lock array; panics if allocation fails. */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(*rt_hash_locks) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock array in this configuration: the lock address is simply NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
251
/* The route cache hash table and its geometry. */
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
/* Bucket indices are produced by masking with this value (see rt_hash()). */
static unsigned			rt_hash_mask __read_mostly;
/* Used as a shift count wherever a value is scaled by the table size. */
static unsigned int		rt_hash_log  __read_mostly;

/* Per-CPU route cache statistics, exported by the rt_cpu_seq_* code. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment one statistics field in the current CPU's copy. */
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)
259
260static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 int genid)
262{
263 return jhash_3words((__force u32)(__be32)(daddr),
264 (__force u32)(__be32)(saddr),
265 idx, genid)
266 & rt_hash_mask;
267}
268
269static inline int rt_genid(struct net *net)
270{
271 return atomic_read(&net->ipv4.rt_genid);
272}
273
274#ifdef CONFIG_PROC_FS
/* Private iterator state of the rt_cache seq_file. */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;	/* hash bucket being walked; the walk counts down to 0 */
	int genid;	/* generation id sampled at start; other entries are skipped */
};
280
281static struct rtable *rt_cache_get_first(struct seq_file *seq)
282{
283 struct rt_cache_iter_state *st = seq->private;
284 struct rtable *r = NULL;
285
286 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 if (!rt_hash_table[st->bucket].chain)
288 continue;
289 rcu_read_lock_bh();
290 r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 while (r) {
292 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 r->rt_genid == st->genid)
294 return r;
295 r = rcu_dereference(r->u.dst.rt_next);
296 }
297 rcu_read_unlock_bh();
298 }
299 return r;
300}
301
/*
 * Advance the raw iterator by one entry, without any namespace/generation
 * filtering.
 *
 * Entered with rcu_read_lock_bh() held.  When the current chain is
 * exhausted the lock is dropped, the next lower non-empty bucket is
 * located, and the lock is re-taken before reading that bucket's head.
 * Returns NULL -- with the lock already released -- once bucket 0 has been
 * passed.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		/* end of this chain: drop the lock while hunting for a bucket */
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rt_hash_table[st->bucket].chain);
		rcu_read_lock_bh();
		/* the chain may have emptied meanwhile; the loop re-checks r */
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}
319
320static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 struct rtable *r)
322{
323 struct rt_cache_iter_state *st = seq->private;
324 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 continue;
327 if (r->rt_genid == st->genid)
328 break;
329 }
330 return r;
331}
332
333static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334{
335 struct rtable *r = rt_cache_get_first(seq);
336
337 if (r)
338 while (pos && (r = rt_cache_get_next(seq, r)))
339 --pos;
340 return pos ? NULL : r;
341}
342
343static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344{
345 struct rt_cache_iter_state *st = seq->private;
346 if (*pos)
347 return rt_cache_get_idx(seq, *pos - 1);
348 st->genid = rt_genid(seq_file_net(seq));
349 return SEQ_START_TOKEN;
350}
351
352static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353{
354 struct rtable *r;
355
356 if (v == SEQ_START_TOKEN)
357 r = rt_cache_get_first(seq);
358 else
359 r = rt_cache_get_next(seq, v);
360 ++*pos;
361 return r;
362}
363
364static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365{
366 if (v && v != SEQ_START_TOKEN)
367 rcu_read_unlock_bh();
368}
369
370static int rt_cache_seq_show(struct seq_file *seq, void *v)
371{
372 if (v == SEQ_START_TOKEN)
373 seq_printf(seq, "%-127s\n",
374 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 "HHUptod\tSpecDst");
377 else {
378 struct rtable *r = v;
379 int len;
380
381 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 r->u.dst.dev ? r->u.dst.dev->name : "*",
384 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 dst_metric(&r->u.dst, RTAX_WINDOW),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 dev_queue_xmit) : 0,
396 r->rt_spec_dst, &len);
397
398 seq_printf(seq, "%*s\n", 127 - len, "");
399 }
400 return 0;
401}
402
/* seq_file iterator callbacks for the per-namespace "rt_cache" listing. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
409
/*
 * ->open for the rt_cache file: set up a namespace-aware seq_file whose
 * private area is a struct rt_cache_iter_state.
 */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
415
/* File operations of the "rt_cache" entry created in ip_rt_do_proc_init(). */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
423
424
425static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426{
427 int cpu;
428
429 if (*pos == 0)
430 return SEQ_START_TOKEN;
431
432 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 if (!cpu_possible(cpu))
434 continue;
435 *pos = cpu+1;
436 return &per_cpu(rt_cache_stat, cpu);
437 }
438 return NULL;
439}
440
441static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442{
443 int cpu;
444
445 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 if (!cpu_possible(cpu))
447 continue;
448 *pos = cpu+1;
449 return &per_cpu(rt_cache_stat, cpu);
450 }
451 return NULL;
452
453}
454
/* seq_file ->stop: nothing to release, the iterator takes no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
459
460static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461{
462 struct rt_cache_stat *st = v;
463
464 if (v == SEQ_START_TOKEN) {
465 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 return 0;
467 }
468
469 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops.entries),
472 st->in_hit,
473 st->in_slow_tot,
474 st->in_slow_mc,
475 st->in_no_route,
476 st->in_brd,
477 st->in_martian_dst,
478 st->in_martian_src,
479
480 st->out_hit,
481 st->out_slow_tot,
482 st->out_slow_mc,
483
484 st->gc_total,
485 st->gc_ignored,
486 st->gc_goal_miss,
487 st->gc_dst_overflow,
488 st->in_hlist_search,
489 st->out_hlist_search
490 );
491 return 0;
492}
493
/* seq_file iterator callbacks for the per-CPU cache statistics file. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
500
501
/* ->open for the statistics file: a plain seq_file over rt_cpu_seq_ops. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
506
/*
 * File operations of the "rt_cache" entry that ip_rt_do_proc_init() creates
 * under net->proc_net_stat.
 */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
514
515#ifdef CONFIG_NET_CLS_ROUTE
516static int rt_acct_proc_show(struct seq_file *m, void *v)
517{
518 struct ip_rt_acct *dst, *src;
519 unsigned int i, j;
520
521 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 if (!dst)
523 return -ENOMEM;
524
525 for_each_possible_cpu(i) {
526 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 for (j = 0; j < 256; j++) {
528 dst[j].o_bytes += src[j].o_bytes;
529 dst[j].o_packets += src[j].o_packets;
530 dst[j].i_bytes += src[j].i_bytes;
531 dst[j].i_packets += src[j].i_packets;
532 }
533 }
534
535 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536 kfree(dst);
537 return 0;
538}
539
/* ->open for "rt_acct": single-record seq_file backed by rt_acct_proc_show(). */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
544
/* File operations of the "rt_acct" proc entry (CONFIG_NET_CLS_ROUTE only). */
static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
552#endif
553
/*
 * Per-namespace init: create the route cache proc entries.
 *
 *   "rt_cache" in the namespace's proc net directory  - cache listing
 *   "rt_cache" under net->proc_net_stat               - per-CPU statistics
 *   "rt_acct"  under net->proc_net                    - only with
 *                                                       CONFIG_NET_CLS_ROUTE
 *
 * Returns 0 on success.  On failure the entries created so far are removed
 * in reverse order and -ENOMEM is returned.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	/* undo the statistics entry */
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	/* undo the cache listing entry */
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
584
/* Per-namespace exit: remove every proc entry ip_rt_do_proc_init() created. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_NET_CLS_ROUTE
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
593
/* Namespace hooks that create and remove the proc entries above. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
598
/*
 * Register the proc setup/teardown hooks as a per-namespace subsystem.
 * Returns whatever register_pernet_subsys() returns.
 */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
603
604#else
/* Without CONFIG_PROC_FS there is nothing to register; always succeeds. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
609#endif
610
/*
 * Hand an entry to call_rcu_bh() so that dst_rcu_free() runs on it after a
 * grace period.  No reference count is touched here.
 */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
615
/*
 * Release the caller's reference on @rt with ip_rt_put() and then queue the
 * entry for RCU-deferred freeing, exactly as rt_free() does.
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
621
622static inline int rt_fast_clean(struct rtable *rth)
623{
624
625
626 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 rth->fl.iif && rth->u.dst.rt_next;
628}
629
630static inline int rt_valuable(struct rtable *rth)
631{
632 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 rth->u.dst.expires;
634}
635
636static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637{
638 unsigned long age;
639 int ret = 0;
640
641 if (atomic_read(&rth->u.dst.__refcnt))
642 goto out;
643
644 ret = 1;
645 if (rth->u.dst.expires &&
646 time_after_eq(jiffies, rth->u.dst.expires))
647 goto out;
648
649 age = jiffies - rth->u.dst.lastuse;
650 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth)))
653 goto out;
654 ret = 1;
655out: return ret;
656}
657
658
659
660
661
662
663static inline u32 rt_score(struct rtable *rt)
664{
665 u32 score = jiffies - rt->u.dst.lastuse;
666
667 score = ~score & ~(3<<30);
668
669 if (rt_valuable(rt))
670 score |= (1<<31);
671
672 if (!rt->fl.iif ||
673 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 score |= (1<<30);
675
676 return score;
677}
678
679static inline bool rt_caching(const struct net *net)
680{
681 return net->ipv4.current_rt_cache_rebuild_count <=
682 net->ipv4.sysctl_rt_cache_rebuild_count;
683}
684
685static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2)
687{
688 return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 (fl1->iif ^ fl2->iif)) == 0);
691}
692
/*
 * Full cache-key comparison: returns non-zero when both flow keys have the
 * same destination, source, mark, oif and iif, and the same 16 bits starting
 * at the tos field.  All differences are XORed and ORed together so a single
 * test against zero decides the result.
 */
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		/*
		 * The u16 cast reads two bytes beginning at tos, so the byte
		 * that follows tos in the structure is compared as well.
		 * NOTE(review): presumably that byte is the scope field --
		 * confirm against the struct flowi layout.
		 */
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
703
704static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705{
706 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
707}
708
709static inline int rt_is_expired(struct rtable *rth)
710{
711 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
712}
713
714
715
716
717
718
/*
 * Walk the whole hash table and free flushed entries.
 *
 * @process_context: non-zero if the caller may sleep; the loop then yields
 *                   the CPU between buckets when a reschedule is pending.
 *
 * With CONFIG_NET_NS only entries whose generation id is stale are removed.
 * Without it every chain is detached and freed in full.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable * tail;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		/* unlocked peek: skip empty buckets without taking the lock */
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
		{
		struct rtable ** prev, * p;

		/* re-read the head now that the lock is held */
		rth = rt_hash_table[i].chain;

		/*
		 * Find the first entry that is still valid.  The run of
		 * expired entries in front of it, [rth, tail), is detached
		 * here and freed after the lock has been dropped.
		 */
		for (tail = rth; tail; tail = tail->u.dst.rt_next)
			if (!rt_is_expired(tail))
				break;
		if (rth != tail)
			rt_hash_table[i].chain = tail;

		/* unlink and free expired entries found beyond the tail */
		prev = &rt_hash_table[i].chain;
		for (p = *prev; p; p = next) {
			next = p->u.dst.rt_next;
			if (!rt_is_expired(p)) {
				prev = &p->u.dst.rt_next;
			} else {
				*prev = next;
				rt_free(p);
			}
		}
		}
#else
		/* single namespace: detach the complete chain */
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		tail = NULL;
#endif
		spin_unlock_bh(rt_hash_lock_addr(i));

		/* free the detached entries outside the lock */
		for (; rth != tail; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}
771
772
773
774
775
776
777
778
779
/*
 * Chain lengths in rt_check_expire() are accumulated in fixed point with
 * FRACT_BITS fractional bits; ONE is the fixed-point value of 1.
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
782
/*
 * Periodic expiry scan, called from rt_worker_func().
 *
 * Scans a number of buckets proportional to the time elapsed since the
 * previous run, continuing from where that run stopped ("rover").  Stale or
 * aged entries are freed.  While scanning, chain lengths are sampled, and
 * afterwards rt_chain_length_max is recomputed as
 * max(ip_rt_gc_elasticity, average + 4 * standard deviation).
 */
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, *aux, **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	/*
	 * goal = elapsed jiffies * table size / ip_rt_gc_timeout, capped at
	 * the number of buckets.  The division is skipped for timeouts <= 1.
	 */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		/* halved for every entry kept, so deeper entries age faster */
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		/* empty buckets count as samples of length 0 */
		samples++;

		if (*rthp == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			prefetch(rth->u.dst.rt_next);
			/* stale generation: always remove */
			if (rt_is_expired(rth)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* explicit deadline: kept only until it has passed */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					/*
					 * Count this entry towards the chain
					 * length only if no earlier entry in
					 * the chain has the same hash inputs,
					 * so keys that differ only in fields
					 * outside compare_hash_inputs() are
					 * counted once.
					 */
					for (aux = rt_hash_table[i].chain;;) {
						if (aux == rth) {
							length += ONE;
							break;
						}
						if (compare_hash_inputs(&aux->fl, &rth->fl))
							break;
						aux = aux->u.dst.rt_next;
					}
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* aged out or past its deadline: unlink and free */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* mean and standard deviation, still in fixed point */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
869
870
871
872
873
/*
 * Delayed-work handler: run one expiry scan and re-arm the work item to run
 * again after ip_rt_gc_interval jiffies.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
879
880
881
882
883
884
885
886static void rt_cache_invalidate(struct net *net)
887{
888 unsigned char shuffle;
889
890 get_random_bytes(&shuffle, sizeof(shuffle));
891 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
892}
893
894
895
896
897
/*
 * Invalidate the route cache of @net.
 *
 * @delay: a negative value only bumps the generation id; zero or a positive
 *         value additionally walks the table and frees stale entries now.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	rt_do_flush(!in_softirq());
}
904
905
/*
 * Free stale entries from the table without changing any generation id.
 * The flush is allowed to reschedule only when not running in softirq
 * context.
 */
void rt_cache_flush_batch(void)
{
	rt_do_flush(!in_softirq());
}
910
911
912
913
/*
 * Timer callback of the per-namespace secret timer.  @__net carries the
 * struct net pointer.  Invalidates the cache and re-arms the timer
 * ip_rt_secret_interval jiffies from now.
 */
static void rt_secret_rebuild(unsigned long __net)
{
	struct net *net = (struct net *)__net;
	rt_cache_invalidate(net);
	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}
920
/*
 * Out-of-schedule cache invalidation: stop the secret timer (waiting for a
 * running handler to finish), invalidate the cache, and, if periodic
 * rebuilds are enabled, restart the timer.
 */
static void rt_secret_rebuild_oneshot(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
	rt_cache_invalidate(net);
	if (ip_rt_secret_interval) {
		/* the new deadline is the previous one plus one interval */
		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
}
930
/*
 * Called by rt_intern_hash() when a hash chain has grown too long: log a
 * rate-limited warning and invalidate the cache immediately.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		printk(KERN_WARNING "Route hash chain too long!\n");
		printk(KERN_WARNING "Adjust your secret_interval!\n");
	}

	rt_secret_rebuild_oneshot(net);
}
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954static int rt_garbage_collect(struct dst_ops *ops)
955{
956 static unsigned long expire = RT_GC_TIMEOUT;
957 static unsigned long last_gc;
958 static int rover;
959 static int equilibrium;
960 struct rtable *rth, **rthp;
961 unsigned long now = jiffies;
962 int goal;
963
964
965
966
967
968
969 RT_CACHE_STAT_INC(gc_total);
970
971 if (now - last_gc < ip_rt_gc_min_interval &&
972 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
973 RT_CACHE_STAT_INC(gc_ignored);
974 goto out;
975 }
976
977
978 goal = atomic_read(&ipv4_dst_ops.entries) -
979 (ip_rt_gc_elasticity << rt_hash_log);
980 if (goal <= 0) {
981 if (equilibrium < ipv4_dst_ops.gc_thresh)
982 equilibrium = ipv4_dst_ops.gc_thresh;
983 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
984 if (goal > 0) {
985 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
986 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
987 }
988 } else {
989
990
991
992 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
993 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
994 }
995
996 if (now - last_gc >= ip_rt_gc_min_interval)
997 last_gc = now;
998
999 if (goal <= 0) {
1000 equilibrium += goal;
1001 goto work_done;
1002 }
1003
1004 do {
1005 int i, k;
1006
1007 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1008 unsigned long tmo = expire;
1009
1010 k = (k + 1) & rt_hash_mask;
1011 rthp = &rt_hash_table[k].chain;
1012 spin_lock_bh(rt_hash_lock_addr(k));
1013 while ((rth = *rthp) != NULL) {
1014 if (!rt_is_expired(rth) &&
1015 !rt_may_expire(rth, tmo, expire)) {
1016 tmo >>= 1;
1017 rthp = &rth->u.dst.rt_next;
1018 continue;
1019 }
1020 *rthp = rth->u.dst.rt_next;
1021 rt_free(rth);
1022 goal--;
1023 }
1024 spin_unlock_bh(rt_hash_lock_addr(k));
1025 if (goal <= 0)
1026 break;
1027 }
1028 rover = k;
1029
1030 if (goal <= 0)
1031 goto work_done;
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042 RT_CACHE_STAT_INC(gc_goal_miss);
1043
1044 if (expire == 0)
1045 break;
1046
1047 expire >>= 1;
1048#if RT_CACHE_DEBUG >= 2
1049 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1050 atomic_read(&ipv4_dst_ops.entries), goal, i);
1051#endif
1052
1053 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1054 goto out;
1055 } while (!in_softirq() && time_before_eq(jiffies, now));
1056
1057 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1058 goto out;
1059 if (net_ratelimit())
1060 printk(KERN_WARNING "dst cache overflow\n");
1061 RT_CACHE_STAT_INC(gc_dst_overflow);
1062 return 1;
1063
1064work_done:
1065 expire += ip_rt_gc_min_interval;
1066 if (expire > ip_rt_gc_timeout ||
1067 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1068 expire = ip_rt_gc_timeout;
1069#if RT_CACHE_DEBUG >= 2
1070 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1071 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1072#endif
1073out: return 0;
1074}
1075
/*
 * Insert a freshly built route into the cache, or reuse an equivalent entry
 * that is already there.
 *
 * @hash: bucket index computed by the caller
 * @rt:   new entry; the caller's reference is consumed on error and when an
 *        existing entry is reused
 * @rp:   if non-NULL, receives the resulting entry
 * @skb:  used when @rp is NULL: the resulting entry becomes the skb's dst
 *
 * Returns 0 on success or a negative errno from arp_bind_neighbour()
 * (-ENOBUFS after the retry budget is used up).
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	/* one garbage-collect-and-retry is allowed outside softirq context */
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->u.dst.dev))) {
		/*
		 * Caching is disabled for this namespace.  The entry is not
		 * hashed: it is handed to the caller for a single use, and
		 * rt_free() queues it for RCU-deferred freeing.
		 * NOTE(review): this relies on dst_rcu_free() not destroying
		 * an entry that is still referenced -- confirm in the dst
		 * core.
		 */
		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
			int err = arp_bind_neighbour(&rt->u.dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				rt_drop(rt);
				return err;
			}
		}

		rt_free(rt);
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		/* drop stale-generation entries while walking */
		if (rt_is_expired(rth)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* equivalent entry found: move it to the chain head */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Both stores use rcu_assign_pointer() because
			 * lockless readers may be walking this chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* the new entry is redundant */
			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->u.dst);
			return 0;
		}

		/* remember the unreferenced entry with the lowest score */
		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* chain longer than the elasticity: evict the candidate */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	} else {
		/*
		 * Nothing evictable and the chain exceeds the adaptive limit:
		 * count a rebuild and invalidate the whole cache.
		 * NOTE(review): this runs with the bucket lock held and ends
		 * in del_timer_sync(); insertion then continues into the
		 * bucket chosen by the caller's @hash -- confirm both are
		 * intended.
		 */
		if (chain_length > rt_chain_length_max) {
			struct net *net = dev_net(rt->u.dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(dev_net(rt->u.dst.dev))) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->u.dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
		}
	}

	/* unicast and output routes get their neighbour bound before insertion */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/*
			 * -ENOBUFS: run the garbage collector once with the
			 * most aggressive settings (elasticity 1, no minimum
			 * interval), restore the tunables, and start over.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Publish the new entry at the chain head; rcu_assign_pointer() is
	 * used because lockless readers may be walking this chain.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->u.dst);
	return 0;
}
1262
1263void rt_bind_peer(struct rtable *rt, int create)
1264{
1265 static DEFINE_SPINLOCK(rt_peer_lock);
1266 struct inet_peer *peer;
1267
1268 peer = inet_getpeer(rt->rt_dst, create);
1269
1270 spin_lock_bh(&rt_peer_lock);
1271 if (rt->peer == NULL) {
1272 rt->peer = peer;
1273 peer = NULL;
1274 }
1275 spin_unlock_bh(&rt_peer_lock);
1276 if (peer)
1277 inet_putpeer(peer);
1278}
1279
1280
1281
1282
1283
1284
1285
1286
/*
 * Fallback IP ID selection, used by __ip_select_ident() when no inet_peer
 * is available.  A global value, protected by its own spinlock, is mixed
 * with the destination address through secure_ip_id(); the low 16 bits of
 * the result become the packet ID and the full result becomes the new
 * global value.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1299
1300void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1301{
1302 struct rtable *rt = (struct rtable *) dst;
1303
1304 if (rt) {
1305 if (rt->peer == NULL)
1306 rt_bind_peer(rt, 1);
1307
1308
1309
1310
1311 if (rt->peer) {
1312 iph->id = htons(inet_getid(rt->peer, more));
1313 return;
1314 }
1315 } else
1316 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1317 __builtin_return_address(0));
1318
1319 ip_select_fb_ident(iph);
1320}
1321
/*
 * Remove @rt from hash chain @hash and release the caller's reference on
 * it.  Any stale-generation entries found in the same chain are removed as
 * well.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	/* drops the reference the caller held on rt */
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1339
/*
 * Process a redirect: traffic from @saddr to @daddr that currently goes via
 * @old_gw on @dev should use @new_gw instead.
 *
 * After validating the redirect, the cache is searched under four keys
 * (source address or 0, combined with the device's ifindex or 0).  For a
 * matching output entry, a copy pointing at @new_gw replaces the original.
 * Rejected redirects are optionally logged.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	/* key variants to probe: exact source / wildcard, exact oif / wildcard */
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* basic sanity: the gateway must change and be a usable address */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	/* with caching disabled there is no entry to update */
	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				/* skip entries whose key does not match this probe */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				/* key matches but the route is not the one redirected */
				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/* pin the entry before leaving the RCU section */
				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/*
				 * Start from a copy of the old entry, then
				 * reset the fields that must not be shared and
				 * take the copy's own references.
				 */
				*rt = *rth;
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->u.dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* the only routing change: the new gateway */
				rt->rt_gateway		= new_gw;

				/* confirm the old path */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				/*
				 * The new gateway must resolve to a neighbour
				 * in a valid state.  If it does not, kick off
				 * resolution and discard the new entry.
				 */
				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				/* replace: rt_del() also releases our hold on rth */
				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			" Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	in_dev_put(in_dev);
}
1478
1479static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1480{
1481 struct rtable *rt = (struct rtable *)dst;
1482 struct dst_entry *ret = dst;
1483
1484 if (rt) {
1485 if (dst->obsolete) {
1486 ip_rt_put(rt);
1487 ret = NULL;
1488 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1489 rt->u.dst.expires) {
1490 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1491 rt->fl.oif,
1492 rt_genid(dev_net(dst->dev)));
1493#if RT_CACHE_DEBUG >= 1
1494 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1495 &rt->rt_dst, rt->fl.fl4_tos);
1496#endif
1497 rt_del(hash, rt);
1498 ret = NULL;
1499 }
1500 }
1501 return ret;
1502}
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520void ip_rt_send_redirect(struct sk_buff *skb)
1521{
1522 struct rtable *rt = skb_rtable(skb);
1523 struct in_device *in_dev;
1524 int log_martians;
1525
1526 rcu_read_lock();
1527 in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1528 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1529 rcu_read_unlock();
1530 return;
1531 }
1532 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1533 rcu_read_unlock();
1534
1535
1536
1537
1538 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1539 rt->u.dst.rate_tokens = 0;
1540
1541
1542
1543
1544 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1545 rt->u.dst.rate_last = jiffies;
1546 return;
1547 }
1548
1549
1550
1551
1552 if (rt->u.dst.rate_tokens == 0 ||
1553 time_after(jiffies,
1554 (rt->u.dst.rate_last +
1555 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1556 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1557 rt->u.dst.rate_last = jiffies;
1558 ++rt->u.dst.rate_tokens;
1559#ifdef CONFIG_IP_ROUTE_VERBOSE
1560 if (log_martians &&
1561 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1562 net_ratelimit())
1563 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1564 &rt->rt_src, rt->rt_iif,
1565 &rt->rt_dst, &rt->rt_gateway);
1566#endif
1567 }
1568}
1569
1570static int ip_error(struct sk_buff *skb)
1571{
1572 struct rtable *rt = skb_rtable(skb);
1573 unsigned long now;
1574 int code;
1575
1576 switch (rt->u.dst.error) {
1577 case EINVAL:
1578 default:
1579 goto out;
1580 case EHOSTUNREACH:
1581 code = ICMP_HOST_UNREACH;
1582 break;
1583 case ENETUNREACH:
1584 code = ICMP_NET_UNREACH;
1585 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1586 IPSTATS_MIB_INNOROUTES);
1587 break;
1588 case EACCES:
1589 code = ICMP_PKT_FILTERED;
1590 break;
1591 }
1592
1593 now = jiffies;
1594 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1595 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1596 rt->u.dst.rate_tokens = ip_rt_error_burst;
1597 rt->u.dst.rate_last = now;
1598 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1599 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1600 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1601 }
1602
1603out: kfree_skb(skb);
1604 return 0;
1605}
1606
1607
1608
1609
1610
1611
/* Candidate MTU values, in strictly descending order. */
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * guess_mtu - pick the largest plateau value strictly below @old_mtu
 * @old_mtu: size that is known to be too big
 *
 * Returns the first table entry smaller than @old_mtu, or 68 when
 * @old_mtu is not larger than any entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short floor_mtu = 68;
	const unsigned short *step = mtu_plateau;
	const unsigned short *end =
		mtu_plateau + sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	while (step != end) {
		if (old_mtu > *step)
			return *step;
		step++;
	}
	return floor_mtu;
}
1624
1625unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1626 unsigned short new_mtu,
1627 struct net_device *dev)
1628{
1629 int i, k;
1630 unsigned short old_mtu = ntohs(iph->tot_len);
1631 struct rtable *rth;
1632 int ikeys[2] = { dev->ifindex, 0 };
1633 __be32 skeys[2] = { iph->saddr, 0, };
1634 __be32 daddr = iph->daddr;
1635 unsigned short est_mtu = 0;
1636
1637 for (k = 0; k < 2; k++) {
1638 for (i = 0; i < 2; i++) {
1639 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1640 rt_genid(net));
1641
1642 rcu_read_lock();
1643 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1644 rth = rcu_dereference(rth->u.dst.rt_next)) {
1645 unsigned short mtu = new_mtu;
1646
1647 if (rth->fl.fl4_dst != daddr ||
1648 rth->fl.fl4_src != skeys[i] ||
1649 rth->rt_dst != daddr ||
1650 rth->rt_src != iph->saddr ||
1651 rth->fl.oif != ikeys[k] ||
1652 rth->fl.iif != 0 ||
1653 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1654 !net_eq(dev_net(rth->u.dst.dev), net) ||
1655 rt_is_expired(rth))
1656 continue;
1657
1658 if (new_mtu < 68 || new_mtu >= old_mtu) {
1659
1660
1661 if (mtu == 0 &&
1662 old_mtu >= dst_mtu(&rth->u.dst) &&
1663 old_mtu >= 68 + (iph->ihl << 2))
1664 old_mtu -= iph->ihl << 2;
1665
1666 mtu = guess_mtu(old_mtu);
1667 }
1668 if (mtu <= dst_mtu(&rth->u.dst)) {
1669 if (mtu < dst_mtu(&rth->u.dst)) {
1670 dst_confirm(&rth->u.dst);
1671 if (mtu < ip_rt_min_pmtu) {
1672 mtu = ip_rt_min_pmtu;
1673 rth->u.dst.metrics[RTAX_LOCK-1] |=
1674 (1 << RTAX_MTU);
1675 }
1676 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1677 dst_set_expires(&rth->u.dst,
1678 ip_rt_mtu_expires);
1679 }
1680 est_mtu = mtu;
1681 }
1682 }
1683 rcu_read_unlock();
1684 }
1685 }
1686 return est_mtu ? : new_mtu;
1687}
1688
1689static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1690{
1691 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1692 !(dst_metric_locked(dst, RTAX_MTU))) {
1693 if (mtu < ip_rt_min_pmtu) {
1694 mtu = ip_rt_min_pmtu;
1695 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1696 }
1697 dst->metrics[RTAX_MTU-1] = mtu;
1698 dst_set_expires(dst, ip_rt_mtu_expires);
1699 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1700 }
1701}
1702
1703static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1704{
1705 return NULL;
1706}
1707
1708static void ipv4_dst_destroy(struct dst_entry *dst)
1709{
1710 struct rtable *rt = (struct rtable *) dst;
1711 struct inet_peer *peer = rt->peer;
1712 struct in_device *idev = rt->idev;
1713
1714 if (peer) {
1715 rt->peer = NULL;
1716 inet_putpeer(peer);
1717 }
1718
1719 if (idev) {
1720 rt->idev = NULL;
1721 in_dev_put(idev);
1722 }
1723}
1724
1725static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1726 int how)
1727{
1728 struct rtable *rt = (struct rtable *) dst;
1729 struct in_device *idev = rt->idev;
1730 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1731 struct in_device *loopback_idev =
1732 in_dev_get(dev_net(dev)->loopback_dev);
1733 if (loopback_idev) {
1734 rt->idev = loopback_idev;
1735 in_dev_put(idev);
1736 }
1737 }
1738}
1739
1740static void ipv4_link_failure(struct sk_buff *skb)
1741{
1742 struct rtable *rt;
1743
1744 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1745
1746 rt = skb_rtable(skb);
1747 if (rt)
1748 dst_set_expires(&rt->u.dst, 0);
1749}
1750
1751static int ip_rt_bug(struct sk_buff *skb)
1752{
1753 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1754 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1755 skb->dev ? skb->dev->name : "?");
1756 kfree_skb(skb);
1757 return 0;
1758}
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769void ip_rt_get_source(u8 *addr, struct rtable *rt)
1770{
1771 __be32 src;
1772 struct fib_result res;
1773
1774 if (rt->fl.iif == 0)
1775 src = rt->rt_src;
1776 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1777 src = FIB_RES_PREFSRC(res);
1778 fib_res_put(&res);
1779 } else
1780 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1781 RT_SCOPE_UNIVERSE);
1782 memcpy(addr, &src, 4);
1783}
1784
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * set_class_tag - fill in whichever 16-bit half of the route's tclassid
 * is still zero
 * @rt:  route whose u.dst.tclassid is updated
 * @tag: value supplying the missing half/halves
 *
 * A half that is already non-zero is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 *classid = &rt->u.dst.tclassid;

	if ((*classid & 0xFFFF) == 0)
		*classid |= tag & 0xFFFF;
	if ((*classid & 0xFFFF0000) == 0)
		*classid |= tag & 0xFFFF0000;
}
#endif
1794
1795static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1796{
1797 struct fib_info *fi = res->fi;
1798
1799 if (fi) {
1800 if (FIB_RES_GW(*res) &&
1801 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1802 rt->rt_gateway = FIB_RES_GW(*res);
1803 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1804 sizeof(rt->u.dst.metrics));
1805 if (fi->fib_mtu == 0) {
1806 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1807 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1808 rt->rt_gateway != rt->rt_dst &&
1809 rt->u.dst.dev->mtu > 576)
1810 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1811 }
1812#ifdef CONFIG_NET_CLS_ROUTE
1813 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1814#endif
1815 } else
1816 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1817
1818 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1819 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1820 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1821 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1822 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1823 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1824 ip_rt_min_advmss);
1825 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1826 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1827
1828#ifdef CONFIG_NET_CLS_ROUTE
1829#ifdef CONFIG_IP_MULTIPLE_TABLES
1830 set_class_tag(rt, fib_rules_tclass(res));
1831#endif
1832 set_class_tag(rt, itag);
1833#endif
1834 rt->rt_type = res->type;
1835}
1836
1837static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1838 u8 tos, struct net_device *dev, int our)
1839{
1840 unsigned hash;
1841 struct rtable *rth;
1842 __be32 spec_dst;
1843 struct in_device *in_dev = in_dev_get(dev);
1844 u32 itag = 0;
1845
1846
1847
1848 if (in_dev == NULL)
1849 return -EINVAL;
1850
1851 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1852 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1853 goto e_inval;
1854
1855 if (ipv4_is_zeronet(saddr)) {
1856 if (!ipv4_is_local_multicast(daddr))
1857 goto e_inval;
1858 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1859 } else if (fib_validate_source(saddr, 0, tos, 0,
1860 dev, &spec_dst, &itag, 0) < 0)
1861 goto e_inval;
1862
1863 rth = dst_alloc(&ipv4_dst_ops);
1864 if (!rth)
1865 goto e_nobufs;
1866
1867 rth->u.dst.output= ip_rt_bug;
1868
1869 atomic_set(&rth->u.dst.__refcnt, 1);
1870 rth->u.dst.flags= DST_HOST;
1871 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1872 rth->u.dst.flags |= DST_NOPOLICY;
1873 rth->fl.fl4_dst = daddr;
1874 rth->rt_dst = daddr;
1875 rth->fl.fl4_tos = tos;
1876 rth->fl.mark = skb->mark;
1877 rth->fl.fl4_src = saddr;
1878 rth->rt_src = saddr;
1879#ifdef CONFIG_NET_CLS_ROUTE
1880 rth->u.dst.tclassid = itag;
1881#endif
1882 rth->rt_iif =
1883 rth->fl.iif = dev->ifindex;
1884 rth->u.dst.dev = init_net.loopback_dev;
1885 dev_hold(rth->u.dst.dev);
1886 rth->idev = in_dev_get(rth->u.dst.dev);
1887 rth->fl.oif = 0;
1888 rth->rt_gateway = daddr;
1889 rth->rt_spec_dst= spec_dst;
1890 rth->rt_genid = rt_genid(dev_net(dev));
1891 rth->rt_flags = RTCF_MULTICAST;
1892 rth->rt_type = RTN_MULTICAST;
1893 if (our) {
1894 rth->u.dst.input= ip_local_deliver;
1895 rth->rt_flags |= RTCF_LOCAL;
1896 }
1897
1898#ifdef CONFIG_IP_MROUTE
1899 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1900 rth->u.dst.input = ip_mr_input;
1901#endif
1902 RT_CACHE_STAT_INC(in_slow_mc);
1903
1904 in_dev_put(in_dev);
1905 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1906 return rt_intern_hash(hash, rth, NULL, skb);
1907
1908e_nobufs:
1909 in_dev_put(in_dev);
1910 return -ENOBUFS;
1911
1912e_inval:
1913 in_dev_put(in_dev);
1914 return -EINVAL;
1915}
1916
1917
1918static void ip_handle_martian_source(struct net_device *dev,
1919 struct in_device *in_dev,
1920 struct sk_buff *skb,
1921 __be32 daddr,
1922 __be32 saddr)
1923{
1924 RT_CACHE_STAT_INC(in_martian_src);
1925#ifdef CONFIG_IP_ROUTE_VERBOSE
1926 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1927
1928
1929
1930
1931 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1932 &daddr, &saddr, dev->name);
1933 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1934 int i;
1935 const unsigned char *p = skb_mac_header(skb);
1936 printk(KERN_WARNING "ll header: ");
1937 for (i = 0; i < dev->hard_header_len; i++, p++) {
1938 printk("%02x", *p);
1939 if (i < (dev->hard_header_len - 1))
1940 printk(":");
1941 }
1942 printk("\n");
1943 }
1944 }
1945#endif
1946}
1947
1948static int __mkroute_input(struct sk_buff *skb,
1949 struct fib_result *res,
1950 struct in_device *in_dev,
1951 __be32 daddr, __be32 saddr, u32 tos,
1952 struct rtable **result)
1953{
1954
1955 struct rtable *rth;
1956 int err;
1957 struct in_device *out_dev;
1958 unsigned flags = 0;
1959 __be32 spec_dst;
1960 u32 itag;
1961
1962
1963 out_dev = in_dev_get(FIB_RES_DEV(*res));
1964 if (out_dev == NULL) {
1965 if (net_ratelimit())
1966 printk(KERN_CRIT "Bug in ip_route_input" \
1967 "_slow(). Please, report\n");
1968 return -EINVAL;
1969 }
1970
1971
1972 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1973 in_dev->dev, &spec_dst, &itag, skb->mark);
1974 if (err < 0) {
1975 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1976 saddr);
1977
1978 err = -EINVAL;
1979 goto cleanup;
1980 }
1981
1982 if (err)
1983 flags |= RTCF_DIRECTSRC;
1984
1985 if (out_dev == in_dev && err &&
1986 (IN_DEV_SHARED_MEDIA(out_dev) ||
1987 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1988 flags |= RTCF_DOREDIRECT;
1989
1990 if (skb->protocol != htons(ETH_P_IP)) {
1991
1992
1993
1994 if (out_dev == in_dev) {
1995 err = -EINVAL;
1996 goto cleanup;
1997 }
1998 }
1999
2000
2001 rth = dst_alloc(&ipv4_dst_ops);
2002 if (!rth) {
2003 err = -ENOBUFS;
2004 goto cleanup;
2005 }
2006
2007 atomic_set(&rth->u.dst.__refcnt, 1);
2008 rth->u.dst.flags= DST_HOST;
2009 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2010 rth->u.dst.flags |= DST_NOPOLICY;
2011 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2012 rth->u.dst.flags |= DST_NOXFRM;
2013 rth->fl.fl4_dst = daddr;
2014 rth->rt_dst = daddr;
2015 rth->fl.fl4_tos = tos;
2016 rth->fl.mark = skb->mark;
2017 rth->fl.fl4_src = saddr;
2018 rth->rt_src = saddr;
2019 rth->rt_gateway = daddr;
2020 rth->rt_iif =
2021 rth->fl.iif = in_dev->dev->ifindex;
2022 rth->u.dst.dev = (out_dev)->dev;
2023 dev_hold(rth->u.dst.dev);
2024 rth->idev = in_dev_get(rth->u.dst.dev);
2025 rth->fl.oif = 0;
2026 rth->rt_spec_dst= spec_dst;
2027
2028 rth->u.dst.input = ip_forward;
2029 rth->u.dst.output = ip_output;
2030 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2031
2032 rt_set_nexthop(rth, res, itag);
2033
2034 rth->rt_flags = flags;
2035
2036 *result = rth;
2037 err = 0;
2038 cleanup:
2039
2040 in_dev_put(out_dev);
2041 return err;
2042}
2043
2044static int ip_mkroute_input(struct sk_buff *skb,
2045 struct fib_result *res,
2046 const struct flowi *fl,
2047 struct in_device *in_dev,
2048 __be32 daddr, __be32 saddr, u32 tos)
2049{
2050 struct rtable* rth = NULL;
2051 int err;
2052 unsigned hash;
2053
2054#ifdef CONFIG_IP_ROUTE_MULTIPATH
2055 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2056 fib_select_multipath(fl, res);
2057#endif
2058
2059
2060 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2061 if (err)
2062 return err;
2063
2064
2065 hash = rt_hash(daddr, saddr, fl->iif,
2066 rt_genid(dev_net(rth->u.dst.dev)));
2067 return rt_intern_hash(hash, rth, NULL, skb);
2068}
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2081 u8 tos, struct net_device *dev)
2082{
2083 struct fib_result res;
2084 struct in_device *in_dev = in_dev_get(dev);
2085 struct flowi fl = { .nl_u = { .ip4_u =
2086 { .daddr = daddr,
2087 .saddr = saddr,
2088 .tos = tos,
2089 .scope = RT_SCOPE_UNIVERSE,
2090 } },
2091 .mark = skb->mark,
2092 .iif = dev->ifindex };
2093 unsigned flags = 0;
2094 u32 itag = 0;
2095 struct rtable * rth;
2096 unsigned hash;
2097 __be32 spec_dst;
2098 int err = -EINVAL;
2099 int free_res = 0;
2100 struct net * net = dev_net(dev);
2101
2102
2103
2104 if (!in_dev)
2105 goto out;
2106
2107
2108
2109
2110
2111 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2112 ipv4_is_loopback(saddr))
2113 goto martian_source;
2114
2115 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2116 goto brd_input;
2117
2118
2119
2120
2121 if (ipv4_is_zeronet(saddr))
2122 goto martian_source;
2123
2124 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2125 ipv4_is_loopback(daddr))
2126 goto martian_destination;
2127
2128
2129
2130
2131 if ((err = fib_lookup(net, &fl, &res)) != 0) {
2132 if (!IN_DEV_FORWARD(in_dev))
2133 goto e_hostunreach;
2134 goto no_route;
2135 }
2136 free_res = 1;
2137
2138 RT_CACHE_STAT_INC(in_slow_tot);
2139
2140 if (res.type == RTN_BROADCAST)
2141 goto brd_input;
2142
2143 if (res.type == RTN_LOCAL) {
2144 int result;
2145 result = fib_validate_source(saddr, daddr, tos,
2146 net->loopback_dev->ifindex,
2147 dev, &spec_dst, &itag, skb->mark);
2148 if (result < 0)
2149 goto martian_source;
2150 if (result)
2151 flags |= RTCF_DIRECTSRC;
2152 spec_dst = daddr;
2153 goto local_input;
2154 }
2155
2156 if (!IN_DEV_FORWARD(in_dev))
2157 goto e_hostunreach;
2158 if (res.type != RTN_UNICAST)
2159 goto martian_destination;
2160
2161 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2162done:
2163 in_dev_put(in_dev);
2164 if (free_res)
2165 fib_res_put(&res);
2166out: return err;
2167
2168brd_input:
2169 if (skb->protocol != htons(ETH_P_IP))
2170 goto e_inval;
2171
2172 if (ipv4_is_zeronet(saddr))
2173 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2174 else {
2175 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2176 &itag, skb->mark);
2177 if (err < 0)
2178 goto martian_source;
2179 if (err)
2180 flags |= RTCF_DIRECTSRC;
2181 }
2182 flags |= RTCF_BROADCAST;
2183 res.type = RTN_BROADCAST;
2184 RT_CACHE_STAT_INC(in_brd);
2185
2186local_input:
2187 rth = dst_alloc(&ipv4_dst_ops);
2188 if (!rth)
2189 goto e_nobufs;
2190
2191 rth->u.dst.output= ip_rt_bug;
2192 rth->rt_genid = rt_genid(net);
2193
2194 atomic_set(&rth->u.dst.__refcnt, 1);
2195 rth->u.dst.flags= DST_HOST;
2196 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2197 rth->u.dst.flags |= DST_NOPOLICY;
2198 rth->fl.fl4_dst = daddr;
2199 rth->rt_dst = daddr;
2200 rth->fl.fl4_tos = tos;
2201 rth->fl.mark = skb->mark;
2202 rth->fl.fl4_src = saddr;
2203 rth->rt_src = saddr;
2204#ifdef CONFIG_NET_CLS_ROUTE
2205 rth->u.dst.tclassid = itag;
2206#endif
2207 rth->rt_iif =
2208 rth->fl.iif = dev->ifindex;
2209 rth->u.dst.dev = net->loopback_dev;
2210 dev_hold(rth->u.dst.dev);
2211 rth->idev = in_dev_get(rth->u.dst.dev);
2212 rth->rt_gateway = daddr;
2213 rth->rt_spec_dst= spec_dst;
2214 rth->u.dst.input= ip_local_deliver;
2215 rth->rt_flags = flags|RTCF_LOCAL;
2216 if (res.type == RTN_UNREACHABLE) {
2217 rth->u.dst.input= ip_error;
2218 rth->u.dst.error= -err;
2219 rth->rt_flags &= ~RTCF_LOCAL;
2220 }
2221 rth->rt_type = res.type;
2222 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2223 err = rt_intern_hash(hash, rth, NULL, skb);
2224 goto done;
2225
2226no_route:
2227 RT_CACHE_STAT_INC(in_no_route);
2228 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2229 res.type = RTN_UNREACHABLE;
2230 if (err == -ESRCH)
2231 err = -ENETUNREACH;
2232 goto local_input;
2233
2234
2235
2236
2237martian_destination:
2238 RT_CACHE_STAT_INC(in_martian_dst);
2239#ifdef CONFIG_IP_ROUTE_VERBOSE
2240 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2241 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2242 &daddr, &saddr, dev->name);
2243#endif
2244
2245e_hostunreach:
2246 err = -EHOSTUNREACH;
2247 goto done;
2248
2249e_inval:
2250 err = -EINVAL;
2251 goto done;
2252
2253e_nobufs:
2254 err = -ENOBUFS;
2255 goto done;
2256
2257martian_source:
2258 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2259 goto e_inval;
2260}
2261
2262int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2263 u8 tos, struct net_device *dev)
2264{
2265 struct rtable * rth;
2266 unsigned hash;
2267 int iif = dev->ifindex;
2268 struct net *net;
2269
2270 net = dev_net(dev);
2271
2272 if (!rt_caching(net))
2273 goto skip_cache;
2274
2275 tos &= IPTOS_RT_MASK;
2276 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2277
2278 rcu_read_lock();
2279 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2280 rth = rcu_dereference(rth->u.dst.rt_next)) {
2281 if (((rth->fl.fl4_dst ^ daddr) |
2282 (rth->fl.fl4_src ^ saddr) |
2283 (rth->fl.iif ^ iif) |
2284 rth->fl.oif |
2285 (rth->fl.fl4_tos ^ tos)) == 0 &&
2286 rth->fl.mark == skb->mark &&
2287 net_eq(dev_net(rth->u.dst.dev), net) &&
2288 !rt_is_expired(rth)) {
2289 dst_use(&rth->u.dst, jiffies);
2290 RT_CACHE_STAT_INC(in_hit);
2291 rcu_read_unlock();
2292 skb_dst_set(skb, &rth->u.dst);
2293 return 0;
2294 }
2295 RT_CACHE_STAT_INC(in_hlist_search);
2296 }
2297 rcu_read_unlock();
2298
2299skip_cache:
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311 if (ipv4_is_multicast(daddr)) {
2312 struct in_device *in_dev;
2313
2314 rcu_read_lock();
2315 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2316 int our = ip_check_mc(in_dev, daddr, saddr,
2317 ip_hdr(skb)->protocol);
2318 if (our
2319#ifdef CONFIG_IP_MROUTE
2320 ||
2321 (!ipv4_is_local_multicast(daddr) &&
2322 IN_DEV_MFORWARD(in_dev))
2323#endif
2324 ) {
2325 rcu_read_unlock();
2326 return ip_route_input_mc(skb, daddr, saddr,
2327 tos, dev, our);
2328 }
2329 }
2330 rcu_read_unlock();
2331 return -EINVAL;
2332 }
2333 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2334}
2335
2336static int __mkroute_output(struct rtable **result,
2337 struct fib_result *res,
2338 const struct flowi *fl,
2339 const struct flowi *oldflp,
2340 struct net_device *dev_out,
2341 unsigned flags)
2342{
2343 struct rtable *rth;
2344 struct in_device *in_dev;
2345 u32 tos = RT_FL_TOS(oldflp);
2346 int err = 0;
2347
2348 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2349 return -EINVAL;
2350
2351 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2352 res->type = RTN_BROADCAST;
2353 else if (ipv4_is_multicast(fl->fl4_dst))
2354 res->type = RTN_MULTICAST;
2355 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2356 return -EINVAL;
2357
2358 if (dev_out->flags & IFF_LOOPBACK)
2359 flags |= RTCF_LOCAL;
2360
2361
2362 in_dev = in_dev_get(dev_out);
2363 if (!in_dev)
2364 return -EINVAL;
2365
2366 if (res->type == RTN_BROADCAST) {
2367 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2368 if (res->fi) {
2369 fib_info_put(res->fi);
2370 res->fi = NULL;
2371 }
2372 } else if (res->type == RTN_MULTICAST) {
2373 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2374 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2375 oldflp->proto))
2376 flags &= ~RTCF_LOCAL;
2377
2378
2379
2380
2381 if (res->fi && res->prefixlen < 4) {
2382 fib_info_put(res->fi);
2383 res->fi = NULL;
2384 }
2385 }
2386
2387
2388 rth = dst_alloc(&ipv4_dst_ops);
2389 if (!rth) {
2390 err = -ENOBUFS;
2391 goto cleanup;
2392 }
2393
2394 atomic_set(&rth->u.dst.__refcnt, 1);
2395 rth->u.dst.flags= DST_HOST;
2396 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2397 rth->u.dst.flags |= DST_NOXFRM;
2398 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2399 rth->u.dst.flags |= DST_NOPOLICY;
2400
2401 rth->fl.fl4_dst = oldflp->fl4_dst;
2402 rth->fl.fl4_tos = tos;
2403 rth->fl.fl4_src = oldflp->fl4_src;
2404 rth->fl.oif = oldflp->oif;
2405 rth->fl.mark = oldflp->mark;
2406 rth->rt_dst = fl->fl4_dst;
2407 rth->rt_src = fl->fl4_src;
2408 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2409
2410
2411 rth->u.dst.dev = dev_out;
2412 dev_hold(dev_out);
2413 rth->idev = in_dev_get(dev_out);
2414 rth->rt_gateway = fl->fl4_dst;
2415 rth->rt_spec_dst= fl->fl4_src;
2416
2417 rth->u.dst.output=ip_output;
2418 rth->rt_genid = rt_genid(dev_net(dev_out));
2419
2420 RT_CACHE_STAT_INC(out_slow_tot);
2421
2422 if (flags & RTCF_LOCAL) {
2423 rth->u.dst.input = ip_local_deliver;
2424 rth->rt_spec_dst = fl->fl4_dst;
2425 }
2426 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2427 rth->rt_spec_dst = fl->fl4_src;
2428 if (flags & RTCF_LOCAL &&
2429 !(dev_out->flags & IFF_LOOPBACK)) {
2430 rth->u.dst.output = ip_mc_output;
2431 RT_CACHE_STAT_INC(out_slow_mc);
2432 }
2433#ifdef CONFIG_IP_MROUTE
2434 if (res->type == RTN_MULTICAST) {
2435 if (IN_DEV_MFORWARD(in_dev) &&
2436 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2437 rth->u.dst.input = ip_mr_input;
2438 rth->u.dst.output = ip_mc_output;
2439 }
2440 }
2441#endif
2442 }
2443
2444 rt_set_nexthop(rth, res, 0);
2445
2446 rth->rt_flags = flags;
2447
2448 *result = rth;
2449 cleanup:
2450
2451 in_dev_put(in_dev);
2452
2453 return err;
2454}
2455
2456static int ip_mkroute_output(struct rtable **rp,
2457 struct fib_result *res,
2458 const struct flowi *fl,
2459 const struct flowi *oldflp,
2460 struct net_device *dev_out,
2461 unsigned flags)
2462{
2463 struct rtable *rth = NULL;
2464 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2465 unsigned hash;
2466 if (err == 0) {
2467 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2468 rt_genid(dev_net(dev_out)));
2469 err = rt_intern_hash(hash, rth, rp, NULL);
2470 }
2471
2472 return err;
2473}
2474
2475
2476
2477
2478
2479static int ip_route_output_slow(struct net *net, struct rtable **rp,
2480 const struct flowi *oldflp)
2481{
2482 u32 tos = RT_FL_TOS(oldflp);
2483 struct flowi fl = { .nl_u = { .ip4_u =
2484 { .daddr = oldflp->fl4_dst,
2485 .saddr = oldflp->fl4_src,
2486 .tos = tos & IPTOS_RT_MASK,
2487 .scope = ((tos & RTO_ONLINK) ?
2488 RT_SCOPE_LINK :
2489 RT_SCOPE_UNIVERSE),
2490 } },
2491 .mark = oldflp->mark,
2492 .iif = net->loopback_dev->ifindex,
2493 .oif = oldflp->oif };
2494 struct fib_result res;
2495 unsigned flags = 0;
2496 struct net_device *dev_out = NULL;
2497 int free_res = 0;
2498 int err;
2499
2500
2501 res.fi = NULL;
2502#ifdef CONFIG_IP_MULTIPLE_TABLES
2503 res.r = NULL;
2504#endif
2505
2506 if (oldflp->fl4_src) {
2507 err = -EINVAL;
2508 if (ipv4_is_multicast(oldflp->fl4_src) ||
2509 ipv4_is_lbcast(oldflp->fl4_src) ||
2510 ipv4_is_zeronet(oldflp->fl4_src))
2511 goto out;
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521 if (oldflp->oif == 0 &&
2522 (ipv4_is_multicast(oldflp->fl4_dst) ||
2523 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2524
2525 dev_out = ip_dev_find(net, oldflp->fl4_src);
2526 if (dev_out == NULL)
2527 goto out;
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544 fl.oif = dev_out->ifindex;
2545 goto make_route;
2546 }
2547
2548 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2549
2550 dev_out = ip_dev_find(net, oldflp->fl4_src);
2551 if (dev_out == NULL)
2552 goto out;
2553 dev_put(dev_out);
2554 dev_out = NULL;
2555 }
2556 }
2557
2558
2559 if (oldflp->oif) {
2560 dev_out = dev_get_by_index(net, oldflp->oif);
2561 err = -ENODEV;
2562 if (dev_out == NULL)
2563 goto out;
2564
2565
2566 if (__in_dev_get_rtnl(dev_out) == NULL) {
2567 dev_put(dev_out);
2568 goto out;
2569 }
2570
2571 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2572 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2573 if (!fl.fl4_src)
2574 fl.fl4_src = inet_select_addr(dev_out, 0,
2575 RT_SCOPE_LINK);
2576 goto make_route;
2577 }
2578 if (!fl.fl4_src) {
2579 if (ipv4_is_multicast(oldflp->fl4_dst))
2580 fl.fl4_src = inet_select_addr(dev_out, 0,
2581 fl.fl4_scope);
2582 else if (!oldflp->fl4_dst)
2583 fl.fl4_src = inet_select_addr(dev_out, 0,
2584 RT_SCOPE_HOST);
2585 }
2586 }
2587
2588 if (!fl.fl4_dst) {
2589 fl.fl4_dst = fl.fl4_src;
2590 if (!fl.fl4_dst)
2591 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2592 if (dev_out)
2593 dev_put(dev_out);
2594 dev_out = net->loopback_dev;
2595 dev_hold(dev_out);
2596 fl.oif = net->loopback_dev->ifindex;
2597 res.type = RTN_LOCAL;
2598 flags |= RTCF_LOCAL;
2599 goto make_route;
2600 }
2601
2602 if (fib_lookup(net, &fl, &res)) {
2603 res.fi = NULL;
2604 if (oldflp->oif) {
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623 if (fl.fl4_src == 0)
2624 fl.fl4_src = inet_select_addr(dev_out, 0,
2625 RT_SCOPE_LINK);
2626 res.type = RTN_UNICAST;
2627 goto make_route;
2628 }
2629 if (dev_out)
2630 dev_put(dev_out);
2631 err = -ENETUNREACH;
2632 goto out;
2633 }
2634 free_res = 1;
2635
2636 if (res.type == RTN_LOCAL) {
2637 if (!fl.fl4_src)
2638 fl.fl4_src = fl.fl4_dst;
2639 if (dev_out)
2640 dev_put(dev_out);
2641 dev_out = net->loopback_dev;
2642 dev_hold(dev_out);
2643 fl.oif = dev_out->ifindex;
2644 if (res.fi)
2645 fib_info_put(res.fi);
2646 res.fi = NULL;
2647 flags |= RTCF_LOCAL;
2648 goto make_route;
2649 }
2650
2651#ifdef CONFIG_IP_ROUTE_MULTIPATH
2652 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2653 fib_select_multipath(&fl, &res);
2654 else
2655#endif
2656 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2657 fib_select_default(net, &fl, &res);
2658
2659 if (!fl.fl4_src)
2660 fl.fl4_src = FIB_RES_PREFSRC(res);
2661
2662 if (dev_out)
2663 dev_put(dev_out);
2664 dev_out = FIB_RES_DEV(res);
2665 dev_hold(dev_out);
2666 fl.oif = dev_out->ifindex;
2667
2668
2669make_route:
2670 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2671
2672
2673 if (free_res)
2674 fib_res_put(&res);
2675 if (dev_out)
2676 dev_put(dev_out);
2677out: return err;
2678}
2679
2680int __ip_route_output_key(struct net *net, struct rtable **rp,
2681 const struct flowi *flp)
2682{
2683 unsigned hash;
2684 struct rtable *rth;
2685
2686 if (!rt_caching(net))
2687 goto slow_output;
2688
2689 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2690
2691 rcu_read_lock_bh();
2692 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2693 rth = rcu_dereference(rth->u.dst.rt_next)) {
2694 if (rth->fl.fl4_dst == flp->fl4_dst &&
2695 rth->fl.fl4_src == flp->fl4_src &&
2696 rth->fl.iif == 0 &&
2697 rth->fl.oif == flp->oif &&
2698 rth->fl.mark == flp->mark &&
2699 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2700 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2701 net_eq(dev_net(rth->u.dst.dev), net) &&
2702 !rt_is_expired(rth)) {
2703 dst_use(&rth->u.dst, jiffies);
2704 RT_CACHE_STAT_INC(out_hit);
2705 rcu_read_unlock_bh();
2706 *rp = rth;
2707 return 0;
2708 }
2709 RT_CACHE_STAT_INC(out_hlist_search);
2710 }
2711 rcu_read_unlock_bh();
2712
2713slow_output:
2714 return ip_route_output_slow(net, rp, flp);
2715}
2716
2717EXPORT_SYMBOL_GPL(__ip_route_output_key);
2718
2719static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2720{
2721}
2722
2723static struct dst_ops ipv4_dst_blackhole_ops = {
2724 .family = AF_INET,
2725 .protocol = cpu_to_be16(ETH_P_IP),
2726 .destroy = ipv4_dst_destroy,
2727 .check = ipv4_dst_check,
2728 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2729 .entries = ATOMIC_INIT(0),
2730};
2731
2732
2733static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2734{
2735 struct rtable *ort = *rp;
2736 struct rtable *rt = (struct rtable *)
2737 dst_alloc(&ipv4_dst_blackhole_ops);
2738
2739 if (rt) {
2740 struct dst_entry *new = &rt->u.dst;
2741
2742 atomic_set(&new->__refcnt, 1);
2743 new->__use = 1;
2744 new->input = dst_discard;
2745 new->output = dst_discard;
2746 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2747
2748 new->dev = ort->u.dst.dev;
2749 if (new->dev)
2750 dev_hold(new->dev);
2751
2752 rt->fl = ort->fl;
2753
2754 rt->idev = ort->idev;
2755 if (rt->idev)
2756 in_dev_hold(rt->idev);
2757 rt->rt_genid = rt_genid(net);
2758 rt->rt_flags = ort->rt_flags;
2759 rt->rt_type = ort->rt_type;
2760 rt->rt_dst = ort->rt_dst;
2761 rt->rt_src = ort->rt_src;
2762 rt->rt_iif = ort->rt_iif;
2763 rt->rt_gateway = ort->rt_gateway;
2764 rt->rt_spec_dst = ort->rt_spec_dst;
2765 rt->peer = ort->peer;
2766 if (rt->peer)
2767 atomic_inc(&rt->peer->refcnt);
2768
2769 dst_free(new);
2770 }
2771
2772 dst_release(&(*rp)->u.dst);
2773 *rp = rt;
2774 return (rt ? 0 : -ENOMEM);
2775}
2776
2777int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2778 struct sock *sk, int flags)
2779{
2780 int err;
2781
2782 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2783 return err;
2784
2785 if (flp->proto) {
2786 if (!flp->fl4_src)
2787 flp->fl4_src = (*rp)->rt_src;
2788 if (!flp->fl4_dst)
2789 flp->fl4_dst = (*rp)->rt_dst;
2790 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2791 flags ? XFRM_LOOKUP_WAIT : 0);
2792 if (err == -EREMOTE)
2793 err = ipv4_dst_blackhole(net, rp, flp);
2794
2795 return err;
2796 }
2797
2798 return 0;
2799}
2800
2801EXPORT_SYMBOL_GPL(ip_route_output_flow);
2802
2803int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2804{
2805 return ip_route_output_flow(net, rp, flp, NULL, 0);
2806}
2807
2808static int rt_fill_info(struct net *net,
2809 struct sk_buff *skb, u32 pid, u32 seq, int event,
2810 int nowait, unsigned int flags)
2811{
2812 struct rtable *rt = skb_rtable(skb);
2813 struct rtmsg *r;
2814 struct nlmsghdr *nlh;
2815 long expires;
2816 u32 id = 0, ts = 0, tsage = 0, error;
2817
2818 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2819 if (nlh == NULL)
2820 return -EMSGSIZE;
2821
2822 r = nlmsg_data(nlh);
2823 r->rtm_family = AF_INET;
2824 r->rtm_dst_len = 32;
2825 r->rtm_src_len = 0;
2826 r->rtm_tos = rt->fl.fl4_tos;
2827 r->rtm_table = RT_TABLE_MAIN;
2828 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2829 r->rtm_type = rt->rt_type;
2830 r->rtm_scope = RT_SCOPE_UNIVERSE;
2831 r->rtm_protocol = RTPROT_UNSPEC;
2832 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2833 if (rt->rt_flags & RTCF_NOTIFY)
2834 r->rtm_flags |= RTM_F_NOTIFY;
2835
2836 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2837
2838 if (rt->fl.fl4_src) {
2839 r->rtm_src_len = 32;
2840 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2841 }
2842 if (rt->u.dst.dev)
2843 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2844#ifdef CONFIG_NET_CLS_ROUTE
2845 if (rt->u.dst.tclassid)
2846 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2847#endif
2848 if (rt->fl.iif)
2849 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2850 else if (rt->rt_src != rt->fl.fl4_src)
2851 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2852
2853 if (rt->rt_dst != rt->rt_gateway)
2854 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2855
2856 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2857 goto nla_put_failure;
2858
2859 error = rt->u.dst.error;
2860 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2861 if (rt->peer) {
2862 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2863 if (rt->peer->tcp_ts_stamp) {
2864 ts = rt->peer->tcp_ts;
2865 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2866 }
2867 }
2868
2869 if (rt->fl.iif) {
2870#ifdef CONFIG_IP_MROUTE
2871 __be32 dst = rt->rt_dst;
2872
2873 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2874 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2875 int err = ipmr_get_route(net, skb, r, nowait);
2876 if (err <= 0) {
2877 if (!nowait) {
2878 if (err == 0)
2879 return 0;
2880 goto nla_put_failure;
2881 } else {
2882 if (err == -EMSGSIZE)
2883 goto nla_put_failure;
2884 error = err;
2885 }
2886 }
2887 } else
2888#endif
2889 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2890 }
2891
2892 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2893 expires, error) < 0)
2894 goto nla_put_failure;
2895
2896 return nlmsg_end(skb, nlh);
2897
2898nla_put_failure:
2899 nlmsg_cancel(skb, nlh);
2900 return -EMSGSIZE;
2901}
2902
2903static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2904{
2905 struct net *net = sock_net(in_skb->sk);
2906 struct rtmsg *rtm;
2907 struct nlattr *tb[RTA_MAX+1];
2908 struct rtable *rt = NULL;
2909 __be32 dst = 0;
2910 __be32 src = 0;
2911 u32 iif;
2912 int err;
2913 struct sk_buff *skb;
2914
2915 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2916 if (err < 0)
2917 goto errout;
2918
2919 rtm = nlmsg_data(nlh);
2920
2921 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2922 if (skb == NULL) {
2923 err = -ENOBUFS;
2924 goto errout;
2925 }
2926
2927
2928
2929
2930 skb_reset_mac_header(skb);
2931 skb_reset_network_header(skb);
2932
2933
2934 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2935 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2936
2937 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2938 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2939 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2940
2941 if (iif) {
2942 struct net_device *dev;
2943
2944 dev = __dev_get_by_index(net, iif);
2945 if (dev == NULL) {
2946 err = -ENODEV;
2947 goto errout_free;
2948 }
2949
2950 skb->protocol = htons(ETH_P_IP);
2951 skb->dev = dev;
2952 local_bh_disable();
2953 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2954 local_bh_enable();
2955
2956 rt = skb_rtable(skb);
2957 if (err == 0 && rt->u.dst.error)
2958 err = -rt->u.dst.error;
2959 } else {
2960 struct flowi fl = {
2961 .nl_u = {
2962 .ip4_u = {
2963 .daddr = dst,
2964 .saddr = src,
2965 .tos = rtm->rtm_tos,
2966 },
2967 },
2968 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2969 };
2970 err = ip_route_output_key(net, &rt, &fl);
2971 }
2972
2973 if (err)
2974 goto errout_free;
2975
2976 skb_dst_set(skb, &rt->u.dst);
2977 if (rtm->rtm_flags & RTM_F_NOTIFY)
2978 rt->rt_flags |= RTCF_NOTIFY;
2979
2980 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2981 RTM_NEWROUTE, 0, 0);
2982 if (err <= 0)
2983 goto errout_free;
2984
2985 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2986errout:
2987 return err;
2988
2989errout_free:
2990 kfree_skb(skb);
2991 goto errout;
2992}
2993
2994int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2995{
2996 struct rtable *rt;
2997 int h, s_h;
2998 int idx, s_idx;
2999 struct net *net;
3000
3001 net = sock_net(skb->sk);
3002
3003 s_h = cb->args[0];
3004 if (s_h < 0)
3005 s_h = 0;
3006 s_idx = idx = cb->args[1];
3007 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3008 if (!rt_hash_table[h].chain)
3009 continue;
3010 rcu_read_lock_bh();
3011 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3012 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3013 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3014 continue;
3015 if (rt_is_expired(rt))
3016 continue;
3017 skb_dst_set(skb, dst_clone(&rt->u.dst));
3018 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3019 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3020 1, NLM_F_MULTI) <= 0) {
3021 skb_dst_drop(skb);
3022 rcu_read_unlock_bh();
3023 goto done;
3024 }
3025 skb_dst_drop(skb);
3026 }
3027 rcu_read_unlock_bh();
3028 }
3029
3030done:
3031 cb->args[0] = h;
3032 cb->args[1] = idx;
3033 return skb->len;
3034}
3035
3036void ip_rt_multicast_event(struct in_device *in_dev)
3037{
3038 rt_cache_flush(dev_net(in_dev->dev), 0);
3039}
3040
3041#ifdef CONFIG_SYSCTL
3042static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3043 void __user *buffer,
3044 size_t *lenp, loff_t *ppos)
3045{
3046 if (write) {
3047 int flush_delay;
3048 ctl_table ctl;
3049 struct net *net;
3050
3051 memcpy(&ctl, __ctl, sizeof(ctl));
3052 ctl.data = &flush_delay;
3053 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3054
3055 net = (struct net *)__ctl->extra1;
3056 rt_cache_flush(net, flush_delay);
3057 return 0;
3058 }
3059
3060 return -EINVAL;
3061}
3062
3063static void rt_secret_reschedule(int old)
3064{
3065 struct net *net;
3066 int new = ip_rt_secret_interval;
3067 int diff = new - old;
3068
3069 if (!diff)
3070 return;
3071
3072 rtnl_lock();
3073 for_each_net(net) {
3074 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3075
3076 if (!new)
3077 continue;
3078
3079 if (deleted) {
3080 long time = net->ipv4.rt_secret_timer.expires - jiffies;
3081
3082 if (time <= 0 || (time += diff) <= 0)
3083 time = 0;
3084
3085 net->ipv4.rt_secret_timer.expires = time;
3086 } else
3087 net->ipv4.rt_secret_timer.expires = new;
3088
3089 net->ipv4.rt_secret_timer.expires += jiffies;
3090 add_timer(&net->ipv4.rt_secret_timer);
3091 }
3092 rtnl_unlock();
3093}
3094
3095static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3096 void __user *buffer, size_t *lenp,
3097 loff_t *ppos)
3098{
3099 int old = ip_rt_secret_interval;
3100 int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3101
3102 rt_secret_reschedule(old);
3103
3104 return ret;
3105}
3106
3107static ctl_table ipv4_route_table[] = {
3108 {
3109 .procname = "gc_thresh",
3110 .data = &ipv4_dst_ops.gc_thresh,
3111 .maxlen = sizeof(int),
3112 .mode = 0644,
3113 .proc_handler = proc_dointvec,
3114 },
3115 {
3116 .procname = "max_size",
3117 .data = &ip_rt_max_size,
3118 .maxlen = sizeof(int),
3119 .mode = 0644,
3120 .proc_handler = proc_dointvec,
3121 },
3122 {
3123
3124
3125 .procname = "gc_min_interval",
3126 .data = &ip_rt_gc_min_interval,
3127 .maxlen = sizeof(int),
3128 .mode = 0644,
3129 .proc_handler = proc_dointvec_jiffies,
3130 },
3131 {
3132 .procname = "gc_min_interval_ms",
3133 .data = &ip_rt_gc_min_interval,
3134 .maxlen = sizeof(int),
3135 .mode = 0644,
3136 .proc_handler = proc_dointvec_ms_jiffies,
3137 },
3138 {
3139 .procname = "gc_timeout",
3140 .data = &ip_rt_gc_timeout,
3141 .maxlen = sizeof(int),
3142 .mode = 0644,
3143 .proc_handler = proc_dointvec_jiffies,
3144 },
3145 {
3146 .procname = "gc_interval",
3147 .data = &ip_rt_gc_interval,
3148 .maxlen = sizeof(int),
3149 .mode = 0644,
3150 .proc_handler = proc_dointvec_jiffies,
3151 },
3152 {
3153 .procname = "redirect_load",
3154 .data = &ip_rt_redirect_load,
3155 .maxlen = sizeof(int),
3156 .mode = 0644,
3157 .proc_handler = proc_dointvec,
3158 },
3159 {
3160 .procname = "redirect_number",
3161 .data = &ip_rt_redirect_number,
3162 .maxlen = sizeof(int),
3163 .mode = 0644,
3164 .proc_handler = proc_dointvec,
3165 },
3166 {
3167 .procname = "redirect_silence",
3168 .data = &ip_rt_redirect_silence,
3169 .maxlen = sizeof(int),
3170 .mode = 0644,
3171 .proc_handler = proc_dointvec,
3172 },
3173 {
3174 .procname = "error_cost",
3175 .data = &ip_rt_error_cost,
3176 .maxlen = sizeof(int),
3177 .mode = 0644,
3178 .proc_handler = proc_dointvec,
3179 },
3180 {
3181 .procname = "error_burst",
3182 .data = &ip_rt_error_burst,
3183 .maxlen = sizeof(int),
3184 .mode = 0644,
3185 .proc_handler = proc_dointvec,
3186 },
3187 {
3188 .procname = "gc_elasticity",
3189 .data = &ip_rt_gc_elasticity,
3190 .maxlen = sizeof(int),
3191 .mode = 0644,
3192 .proc_handler = proc_dointvec,
3193 },
3194 {
3195 .procname = "mtu_expires",
3196 .data = &ip_rt_mtu_expires,
3197 .maxlen = sizeof(int),
3198 .mode = 0644,
3199 .proc_handler = proc_dointvec_jiffies,
3200 },
3201 {
3202 .procname = "min_pmtu",
3203 .data = &ip_rt_min_pmtu,
3204 .maxlen = sizeof(int),
3205 .mode = 0644,
3206 .proc_handler = proc_dointvec,
3207 },
3208 {
3209 .procname = "min_adv_mss",
3210 .data = &ip_rt_min_advmss,
3211 .maxlen = sizeof(int),
3212 .mode = 0644,
3213 .proc_handler = proc_dointvec,
3214 },
3215 {
3216 .procname = "secret_interval",
3217 .data = &ip_rt_secret_interval,
3218 .maxlen = sizeof(int),
3219 .mode = 0644,
3220 .proc_handler = ipv4_sysctl_rt_secret_interval,
3221 },
3222 { }
3223};
3224
3225static struct ctl_table empty[1];
3226
3227static struct ctl_table ipv4_skeleton[] =
3228{
3229 { .procname = "route",
3230 .mode = 0555, .child = ipv4_route_table},
3231 { .procname = "neigh",
3232 .mode = 0555, .child = empty},
3233 { }
3234};
3235
3236static __net_initdata struct ctl_path ipv4_path[] = {
3237 { .procname = "net", },
3238 { .procname = "ipv4", },
3239 { },
3240};
3241
3242static struct ctl_table ipv4_route_flush_table[] = {
3243 {
3244 .procname = "flush",
3245 .maxlen = sizeof(int),
3246 .mode = 0200,
3247 .proc_handler = ipv4_sysctl_rtcache_flush,
3248 },
3249 { },
3250};
3251
3252static __net_initdata struct ctl_path ipv4_route_path[] = {
3253 { .procname = "net", },
3254 { .procname = "ipv4", },
3255 { .procname = "route", },
3256 { },
3257};
3258
3259static __net_init int sysctl_route_net_init(struct net *net)
3260{
3261 struct ctl_table *tbl;
3262
3263 tbl = ipv4_route_flush_table;
3264 if (!net_eq(net, &init_net)) {
3265 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3266 if (tbl == NULL)
3267 goto err_dup;
3268 }
3269 tbl[0].extra1 = net;
3270
3271 net->ipv4.route_hdr =
3272 register_net_sysctl_table(net, ipv4_route_path, tbl);
3273 if (net->ipv4.route_hdr == NULL)
3274 goto err_reg;
3275 return 0;
3276
3277err_reg:
3278 if (tbl != ipv4_route_flush_table)
3279 kfree(tbl);
3280err_dup:
3281 return -ENOMEM;
3282}
3283
3284static __net_exit void sysctl_route_net_exit(struct net *net)
3285{
3286 struct ctl_table *tbl;
3287
3288 tbl = net->ipv4.route_hdr->ctl_table_arg;
3289 unregister_net_sysctl_table(net->ipv4.route_hdr);
3290 BUG_ON(tbl == ipv4_route_flush_table);
3291 kfree(tbl);
3292}
3293
3294static __net_initdata struct pernet_operations sysctl_route_ops = {
3295 .init = sysctl_route_net_init,
3296 .exit = sysctl_route_net_exit,
3297};
3298#endif
3299
3300
3301static __net_init int rt_secret_timer_init(struct net *net)
3302{
3303 atomic_set(&net->ipv4.rt_genid,
3304 (int) ((num_physpages ^ (num_physpages>>8)) ^
3305 (jiffies ^ (jiffies >> 7))));
3306
3307 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3308 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3309 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3310
3311 if (ip_rt_secret_interval) {
3312 net->ipv4.rt_secret_timer.expires =
3313 jiffies + net_random() % ip_rt_secret_interval +
3314 ip_rt_secret_interval;
3315 add_timer(&net->ipv4.rt_secret_timer);
3316 }
3317 return 0;
3318}
3319
3320static __net_exit void rt_secret_timer_exit(struct net *net)
3321{
3322 del_timer_sync(&net->ipv4.rt_secret_timer);
3323}
3324
3325static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3326 .init = rt_secret_timer_init,
3327 .exit = rt_secret_timer_exit,
3328};
3329
3330
#ifdef CONFIG_NET_CLS_ROUTE
/* Per-cpu accounting area, allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif
3334
3335static __initdata unsigned long rhash_entries;
3336static int __init set_rhash_entries(char *str)
3337{
3338 if (!str)
3339 return 0;
3340 rhash_entries = simple_strtoul(str, &str, 0);
3341 return 1;
3342}
3343__setup("rhash_entries=", set_rhash_entries);
3344
3345int __init ip_rt_init(void)
3346{
3347 int rc = 0;
3348
3349#ifdef CONFIG_NET_CLS_ROUTE
3350 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3351 if (!ip_rt_acct)
3352 panic("IP: failed to allocate ip_rt_acct\n");
3353#endif
3354
3355 ipv4_dst_ops.kmem_cachep =
3356 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3357 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3358
3359 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3360
3361 rt_hash_table = (struct rt_hash_bucket *)
3362 alloc_large_system_hash("IP route cache",
3363 sizeof(struct rt_hash_bucket),
3364 rhash_entries,
3365 (totalram_pages >= 128 * 1024) ?
3366 15 : 17,
3367 0,
3368 &rt_hash_log,
3369 &rt_hash_mask,
3370 rhash_entries ? 0 : 512 * 1024);
3371 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3372 rt_hash_lock_init();
3373
3374 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3375 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3376
3377 devinet_init();
3378 ip_fib_init();
3379
3380
3381
3382
3383 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3384 expires_ljiffies = jiffies;
3385 schedule_delayed_work(&expires_work,
3386 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3387
3388 if (register_pernet_subsys(&rt_secret_timer_ops))
3389 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3390
3391 if (ip_rt_proc_init())
3392 printk(KERN_ERR "Unable to create route proc files\n");
3393#ifdef CONFIG_XFRM
3394 xfrm_init();
3395 xfrm4_init(ip_rt_max_size);
3396#endif
3397 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3398
3399#ifdef CONFIG_SYSCTL
3400 register_pernet_subsys(&sysctl_route_ops);
3401#endif
3402 return rc;
3403}
3404
3405#ifdef CONFIG_SYSCTL
3406
3407
3408
3409
3410void __init ip_static_sysctl_init(void)
3411{
3412 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3413}
3414#endif
3415
/* Symbols exported to modules. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);
3419