1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
71#include <linux/mm.h>
72#include <linux/bootmem.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/workqueue.h>
83#include <linux/skbuff.h>
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
93#include <net/dst.h>
94#include <net/net_namespace.h>
95#include <net/protocol.h>
96#include <net/ip.h>
97#include <net/route.h>
98#include <net/inetpeer.h>
99#include <net/sock.h>
100#include <net/ip_fib.h>
101#include <net/arp.h>
102#include <net/tcp.h>
103#include <net/icmp.h>
104#include <net/xfrm.h>
105#include <net/netevent.h>
106#include <net/rtnetlink.h>
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
/*
 * Extract the TOS bits kept by IPTOS_RT_MASK, plus the RTO_ONLINK flag,
 * from the flow key pointed to by @oldflp.
 *
 * The argument is parenthesised so that any pointer-valued expression
 * (not just a plain identifier) expands correctly.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113
114#define IP_MAX_MTU 0xFFF0
115
116#define RT_GC_TIMEOUT (300*HZ)
117
118static int ip_rt_max_size;
119static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
120static int ip_rt_gc_interval __read_mostly = 60 * HZ;
121static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
122static int ip_rt_redirect_number __read_mostly = 9;
123static int ip_rt_redirect_load __read_mostly = HZ / 50;
124static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125static int ip_rt_error_cost __read_mostly = HZ;
126static int ip_rt_error_burst __read_mostly = 5 * HZ;
127static int ip_rt_gc_elasticity __read_mostly = 8;
128static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130static int ip_rt_min_advmss __read_mostly = 256;
131static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
132static int rt_chain_length_max __read_mostly = 20;
133
134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
136
137
138
139
140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142static void ipv4_dst_destroy(struct dst_entry *dst);
143static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148static int rt_garbage_collect(struct dst_ops *ops);
149static void rt_emergency_hash_rebuild(struct net *net);
150
151
152static struct dst_ops ipv4_dst_ops = {
153 .family = AF_INET,
154 .protocol = cpu_to_be16(ETH_P_IP),
155 .gc = rt_garbage_collect,
156 .check = ipv4_dst_check,
157 .destroy = ipv4_dst_destroy,
158 .ifdown = ipv4_dst_ifdown,
159 .negative_advice = ipv4_negative_advice,
160 .link_failure = ipv4_link_failure,
161 .update_pmtu = ip_rt_update_pmtu,
162 .local_out = __ip_local_out,
163 .entries = ATOMIC_INIT(0),
164};
165
166#define ECN_OR_COST(class) TC_PRIO_##class
167
168const __u8 ip_tos2prio[16] = {
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
185};
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/*
 * One slot of the route cache hash table: the head of a singly linked
 * chain of rtable entries linked through u.dst.rt_next.
 */
struct rt_hash_bucket {
	struct rtable	*chain;
};
205
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * The hash chains are protected by a fixed-size array of spinlocks
 * rather than one lock per bucket; a bucket maps to a lock by masking
 * its index.  RT_HASH_LOCK_SZ must therefore be a power of two.
 * The size scales with NR_CPUS, and is pinned to 256 under lockdep.
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Whole expansion is parenthesised so it is safe inside any expression. */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the lock array.  Allocation failure is
 * fatal: the function panics rather than returning an error.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock array in this configuration: every slot maps to NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
251
252static struct rt_hash_bucket *rt_hash_table __read_mostly;
253static unsigned rt_hash_mask __read_mostly;
254static unsigned int rt_hash_log __read_mostly;
255
256static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257#define RT_CACHE_STAT_INC(field) \
258 (__raw_get_cpu_var(rt_cache_stat).field++)
259
260static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 int genid)
262{
263 return jhash_3words((__force u32)(__be32)(daddr),
264 (__force u32)(__be32)(saddr),
265 idx, genid)
266 & rt_hash_mask;
267}
268
269static inline int rt_genid(struct net *net)
270{
271 return atomic_read(&net->ipv4.rt_genid);
272}
273
274#ifdef CONFIG_PROC_FS
275struct rt_cache_iter_state {
276 struct seq_net_private p;
277 int bucket;
278 int genid;
279};
280
281static struct rtable *rt_cache_get_first(struct seq_file *seq)
282{
283 struct rt_cache_iter_state *st = seq->private;
284 struct rtable *r = NULL;
285
286 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 if (!rt_hash_table[st->bucket].chain)
288 continue;
289 rcu_read_lock_bh();
290 r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 while (r) {
292 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 r->rt_genid == st->genid)
294 return r;
295 r = rcu_dereference(r->u.dst.rt_next);
296 }
297 rcu_read_unlock_bh();
298 }
299 return r;
300}
301
302static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 struct rtable *r)
304{
305 struct rt_cache_iter_state *st = seq->private;
306
307 r = r->u.dst.rt_next;
308 while (!r) {
309 rcu_read_unlock_bh();
310 do {
311 if (--st->bucket < 0)
312 return NULL;
313 } while (!rt_hash_table[st->bucket].chain);
314 rcu_read_lock_bh();
315 r = rt_hash_table[st->bucket].chain;
316 }
317 return rcu_dereference(r);
318}
319
320static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 struct rtable *r)
322{
323 struct rt_cache_iter_state *st = seq->private;
324 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 continue;
327 if (r->rt_genid == st->genid)
328 break;
329 }
330 return r;
331}
332
333static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334{
335 struct rtable *r = rt_cache_get_first(seq);
336
337 if (r)
338 while (pos && (r = rt_cache_get_next(seq, r)))
339 --pos;
340 return pos ? NULL : r;
341}
342
343static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344{
345 struct rt_cache_iter_state *st = seq->private;
346 if (*pos)
347 return rt_cache_get_idx(seq, *pos - 1);
348 st->genid = rt_genid(seq_file_net(seq));
349 return SEQ_START_TOKEN;
350}
351
352static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353{
354 struct rtable *r;
355
356 if (v == SEQ_START_TOKEN)
357 r = rt_cache_get_first(seq);
358 else
359 r = rt_cache_get_next(seq, v);
360 ++*pos;
361 return r;
362}
363
364static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365{
366 if (v && v != SEQ_START_TOKEN)
367 rcu_read_unlock_bh();
368}
369
370static int rt_cache_seq_show(struct seq_file *seq, void *v)
371{
372 if (v == SEQ_START_TOKEN)
373 seq_printf(seq, "%-127s\n",
374 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 "HHUptod\tSpecDst");
377 else {
378 struct rtable *r = v;
379 int len;
380
381 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 r->u.dst.dev ? r->u.dst.dev->name : "*",
384 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 dst_metric(&r->u.dst, RTAX_WINDOW),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 dev_queue_xmit) : 0,
396 r->rt_spec_dst, &len);
397
398 seq_printf(seq, "%*s\n", 127 - len, "");
399 }
400 return 0;
401}
402
403static const struct seq_operations rt_cache_seq_ops = {
404 .start = rt_cache_seq_start,
405 .next = rt_cache_seq_next,
406 .stop = rt_cache_seq_stop,
407 .show = rt_cache_seq_show,
408};
409
410static int rt_cache_seq_open(struct inode *inode, struct file *file)
411{
412 return seq_open_net(inode, file, &rt_cache_seq_ops,
413 sizeof(struct rt_cache_iter_state));
414}
415
416static const struct file_operations rt_cache_seq_fops = {
417 .owner = THIS_MODULE,
418 .open = rt_cache_seq_open,
419 .read = seq_read,
420 .llseek = seq_lseek,
421 .release = seq_release_net,
422};
423
424
425static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426{
427 int cpu;
428
429 if (*pos == 0)
430 return SEQ_START_TOKEN;
431
432 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 if (!cpu_possible(cpu))
434 continue;
435 *pos = cpu+1;
436 return &per_cpu(rt_cache_stat, cpu);
437 }
438 return NULL;
439}
440
441static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442{
443 int cpu;
444
445 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 if (!cpu_possible(cpu))
447 continue;
448 *pos = cpu+1;
449 return &per_cpu(rt_cache_stat, cpu);
450 }
451 return NULL;
452
453}
454
/* seq_file ->stop: the per-cpu iterator holds nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
459
460static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461{
462 struct rt_cache_stat *st = v;
463
464 if (v == SEQ_START_TOKEN) {
465 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 return 0;
467 }
468
469 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops.entries),
472 st->in_hit,
473 st->in_slow_tot,
474 st->in_slow_mc,
475 st->in_no_route,
476 st->in_brd,
477 st->in_martian_dst,
478 st->in_martian_src,
479
480 st->out_hit,
481 st->out_slow_tot,
482 st->out_slow_mc,
483
484 st->gc_total,
485 st->gc_ignored,
486 st->gc_goal_miss,
487 st->gc_dst_overflow,
488 st->in_hlist_search,
489 st->out_hlist_search
490 );
491 return 0;
492}
493
494static const struct seq_operations rt_cpu_seq_ops = {
495 .start = rt_cpu_seq_start,
496 .next = rt_cpu_seq_next,
497 .stop = rt_cpu_seq_stop,
498 .show = rt_cpu_seq_show,
499};
500
501
502static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503{
504 return seq_open(file, &rt_cpu_seq_ops);
505}
506
507static const struct file_operations rt_cpu_seq_fops = {
508 .owner = THIS_MODULE,
509 .open = rt_cpu_seq_open,
510 .read = seq_read,
511 .llseek = seq_lseek,
512 .release = seq_release,
513};
514
515#ifdef CONFIG_NET_CLS_ROUTE
516static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
517 int length, int *eof, void *data)
518{
519 unsigned int i;
520
521 if ((offset & 3) || (length & 3))
522 return -EIO;
523
524 if (offset >= sizeof(struct ip_rt_acct) * 256) {
525 *eof = 1;
526 return 0;
527 }
528
529 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
530 length = sizeof(struct ip_rt_acct) * 256 - offset;
531 *eof = 1;
532 }
533
534 offset /= sizeof(u32);
535
536 if (length > 0) {
537 u32 *dst = (u32 *) buffer;
538
539 *start = buffer;
540 memset(dst, 0, length);
541
542 for_each_possible_cpu(i) {
543 unsigned int j;
544 u32 *src;
545
546 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
547 for (j = 0; j < length/4; j++)
548 dst[j] += src[j];
549 }
550 }
551 return length;
552}
553#endif
554
555static int __net_init ip_rt_do_proc_init(struct net *net)
556{
557 struct proc_dir_entry *pde;
558
559 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
560 &rt_cache_seq_fops);
561 if (!pde)
562 goto err1;
563
564 pde = proc_create("rt_cache", S_IRUGO,
565 net->proc_net_stat, &rt_cpu_seq_fops);
566 if (!pde)
567 goto err2;
568
569#ifdef CONFIG_NET_CLS_ROUTE
570 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
571 ip_rt_acct_read, NULL);
572 if (!pde)
573 goto err3;
574#endif
575 return 0;
576
577#ifdef CONFIG_NET_CLS_ROUTE
578err3:
579 remove_proc_entry("rt_cache", net->proc_net_stat);
580#endif
581err2:
582 remove_proc_entry("rt_cache", net->proc_net);
583err1:
584 return -ENOMEM;
585}
586
587static void __net_exit ip_rt_do_proc_exit(struct net *net)
588{
589 remove_proc_entry("rt_cache", net->proc_net_stat);
590 remove_proc_entry("rt_cache", net->proc_net);
591 remove_proc_entry("rt_acct", net->proc_net);
592}
593
594static struct pernet_operations ip_rt_proc_ops __net_initdata = {
595 .init = ip_rt_do_proc_init,
596 .exit = ip_rt_do_proc_exit,
597};
598
599static int __init ip_rt_proc_init(void)
600{
601 return register_pernet_subsys(&ip_rt_proc_ops);
602}
603
604#else
/* Without CONFIG_PROC_FS there is nothing to register: always succeed. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
609#endif
610
611static inline void rt_free(struct rtable *rt)
612{
613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
614}
615
/*
 * Drop the caller's reference on @rt with ip_rt_put(), then queue the
 * entry for RCU-deferred release exactly as rt_free() does.
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
621
622static inline int rt_fast_clean(struct rtable *rth)
623{
624
625
626 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 rth->fl.iif && rth->u.dst.rt_next;
628}
629
630static inline int rt_valuable(struct rtable *rth)
631{
632 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 rth->u.dst.expires;
634}
635
636static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637{
638 unsigned long age;
639 int ret = 0;
640
641 if (atomic_read(&rth->u.dst.__refcnt))
642 goto out;
643
644 ret = 1;
645 if (rth->u.dst.expires &&
646 time_after_eq(jiffies, rth->u.dst.expires))
647 goto out;
648
649 age = jiffies - rth->u.dst.lastuse;
650 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth)))
653 goto out;
654 ret = 1;
655out: return ret;
656}
657
658
659
660
661
662
663static inline u32 rt_score(struct rtable *rt)
664{
665 u32 score = jiffies - rt->u.dst.lastuse;
666
667 score = ~score & ~(3<<30);
668
669 if (rt_valuable(rt))
670 score |= (1<<31);
671
672 if (!rt->fl.iif ||
673 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 score |= (1<<30);
675
676 return score;
677}
678
679static inline bool rt_caching(const struct net *net)
680{
681 return net->ipv4.current_rt_cache_rebuild_count <=
682 net->ipv4.sysctl_rt_cache_rebuild_count;
683}
684
685static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2)
687{
688 return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 (fl1->iif ^ fl2->iif)) == 0);
691}
692
693static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694{
695 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697 (fl1->mark ^ fl2->mark) |
698 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
699 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 (fl1->oif ^ fl2->oif) |
701 (fl1->iif ^ fl2->iif)) == 0;
702}
703
704static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705{
706 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
707}
708
709static inline int rt_is_expired(struct rtable *rth)
710{
711 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
712}
713
714
715
716
717
718
719static void rt_do_flush(int process_context)
720{
721 unsigned int i;
722 struct rtable *rth, *next;
723 struct rtable * tail;
724
725 for (i = 0; i <= rt_hash_mask; i++) {
726 if (process_context && need_resched())
727 cond_resched();
728 rth = rt_hash_table[i].chain;
729 if (!rth)
730 continue;
731
732 spin_lock_bh(rt_hash_lock_addr(i));
733#ifdef CONFIG_NET_NS
734 {
735 struct rtable ** prev, * p;
736
737 rth = rt_hash_table[i].chain;
738
739
740 for (tail = rth; tail; tail = tail->u.dst.rt_next)
741 if (!rt_is_expired(tail))
742 break;
743 if (rth != tail)
744 rt_hash_table[i].chain = tail;
745
746
747 prev = &rt_hash_table[i].chain;
748 for (p = *prev; p; p = next) {
749 next = p->u.dst.rt_next;
750 if (!rt_is_expired(p)) {
751 prev = &p->u.dst.rt_next;
752 } else {
753 *prev = next;
754 rt_free(p);
755 }
756 }
757 }
758#else
759 rth = rt_hash_table[i].chain;
760 rt_hash_table[i].chain = NULL;
761 tail = NULL;
762#endif
763 spin_unlock_bh(rt_hash_lock_addr(i));
764
765 for (; rth != tail; rth = next) {
766 next = rth->u.dst.rt_next;
767 rt_free(rth);
768 }
769 }
770}
771
772
773
774
775
776
777
778
779
780#define FRACT_BITS 3
781#define ONE (1UL << FRACT_BITS)
782
783static void rt_check_expire(void)
784{
785 static unsigned int rover;
786 unsigned int i = rover, goal;
787 struct rtable *rth, *aux, **rthp;
788 unsigned long samples = 0;
789 unsigned long sum = 0, sum2 = 0;
790 unsigned long delta;
791 u64 mult;
792
793 delta = jiffies - expires_ljiffies;
794 expires_ljiffies = jiffies;
795 mult = ((u64)delta) << rt_hash_log;
796 if (ip_rt_gc_timeout > 1)
797 do_div(mult, ip_rt_gc_timeout);
798 goal = (unsigned int)mult;
799 if (goal > rt_hash_mask)
800 goal = rt_hash_mask + 1;
801 for (; goal > 0; goal--) {
802 unsigned long tmo = ip_rt_gc_timeout;
803 unsigned long length;
804
805 i = (i + 1) & rt_hash_mask;
806 rthp = &rt_hash_table[i].chain;
807
808 if (need_resched())
809 cond_resched();
810
811 samples++;
812
813 if (*rthp == NULL)
814 continue;
815 length = 0;
816 spin_lock_bh(rt_hash_lock_addr(i));
817 while ((rth = *rthp) != NULL) {
818 prefetch(rth->u.dst.rt_next);
819 if (rt_is_expired(rth)) {
820 *rthp = rth->u.dst.rt_next;
821 rt_free(rth);
822 continue;
823 }
824 if (rth->u.dst.expires) {
825
826 if (time_before_eq(jiffies, rth->u.dst.expires)) {
827nofree:
828 tmo >>= 1;
829 rthp = &rth->u.dst.rt_next;
830
831
832
833
834
835
836
837
838 for (aux = rt_hash_table[i].chain;;) {
839 if (aux == rth) {
840 length += ONE;
841 break;
842 }
843 if (compare_hash_inputs(&aux->fl, &rth->fl))
844 break;
845 aux = aux->u.dst.rt_next;
846 }
847 continue;
848 }
849 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
850 goto nofree;
851
852
853 *rthp = rth->u.dst.rt_next;
854 rt_free(rth);
855 }
856 spin_unlock_bh(rt_hash_lock_addr(i));
857 sum += length;
858 sum2 += length*length;
859 }
860 if (samples) {
861 unsigned long avg = sum / samples;
862 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
863 rt_chain_length_max = max_t(unsigned long,
864 ip_rt_gc_elasticity,
865 (avg + 4*sd) >> FRACT_BITS);
866 }
867 rover = i;
868}
869
870
871
872
873
874static void rt_worker_func(struct work_struct *work)
875{
876 rt_check_expire();
877 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
878}
879
880
881
882
883
884
885
886static void rt_cache_invalidate(struct net *net)
887{
888 unsigned char shuffle;
889
890 get_random_bytes(&shuffle, sizeof(shuffle));
891 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
892}
893
894
895
896
897
/*
 * Invalidate the route cache of @net.
 *
 * @delay: a negative value only invalidates (stale entries are reaped
 *         lazily by the other cache walkers); zero or a positive value
 *         also flushes stale entries right away.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	/* rt_do_flush() may only reschedule when not in softirq context. */
	rt_do_flush(!in_softirq());
}
904
905
906
907
908static void rt_secret_rebuild(unsigned long __net)
909{
910 struct net *net = (struct net *)__net;
911 rt_cache_invalidate(net);
912 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
913}
914
915static void rt_secret_rebuild_oneshot(struct net *net)
916{
917 del_timer_sync(&net->ipv4.rt_secret_timer);
918 rt_cache_invalidate(net);
919 if (ip_rt_secret_interval) {
920 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
921 add_timer(&net->ipv4.rt_secret_timer);
922 }
923}
924
925static void rt_emergency_hash_rebuild(struct net *net)
926{
927 if (net_ratelimit()) {
928 printk(KERN_WARNING "Route hash chain too long!\n");
929 printk(KERN_WARNING "Adjust your secret_interval!\n");
930 }
931
932 rt_secret_rebuild_oneshot(net);
933}
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948static int rt_garbage_collect(struct dst_ops *ops)
949{
950 static unsigned long expire = RT_GC_TIMEOUT;
951 static unsigned long last_gc;
952 static int rover;
953 static int equilibrium;
954 struct rtable *rth, **rthp;
955 unsigned long now = jiffies;
956 int goal;
957
958
959
960
961
962
963 RT_CACHE_STAT_INC(gc_total);
964
965 if (now - last_gc < ip_rt_gc_min_interval &&
966 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
967 RT_CACHE_STAT_INC(gc_ignored);
968 goto out;
969 }
970
971
972 goal = atomic_read(&ipv4_dst_ops.entries) -
973 (ip_rt_gc_elasticity << rt_hash_log);
974 if (goal <= 0) {
975 if (equilibrium < ipv4_dst_ops.gc_thresh)
976 equilibrium = ipv4_dst_ops.gc_thresh;
977 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
978 if (goal > 0) {
979 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
980 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
981 }
982 } else {
983
984
985
986 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
987 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
988 }
989
990 if (now - last_gc >= ip_rt_gc_min_interval)
991 last_gc = now;
992
993 if (goal <= 0) {
994 equilibrium += goal;
995 goto work_done;
996 }
997
998 do {
999 int i, k;
1000
1001 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1002 unsigned long tmo = expire;
1003
1004 k = (k + 1) & rt_hash_mask;
1005 rthp = &rt_hash_table[k].chain;
1006 spin_lock_bh(rt_hash_lock_addr(k));
1007 while ((rth = *rthp) != NULL) {
1008 if (!rt_is_expired(rth) &&
1009 !rt_may_expire(rth, tmo, expire)) {
1010 tmo >>= 1;
1011 rthp = &rth->u.dst.rt_next;
1012 continue;
1013 }
1014 *rthp = rth->u.dst.rt_next;
1015 rt_free(rth);
1016 goal--;
1017 }
1018 spin_unlock_bh(rt_hash_lock_addr(k));
1019 if (goal <= 0)
1020 break;
1021 }
1022 rover = k;
1023
1024 if (goal <= 0)
1025 goto work_done;
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036 RT_CACHE_STAT_INC(gc_goal_miss);
1037
1038 if (expire == 0)
1039 break;
1040
1041 expire >>= 1;
1042#if RT_CACHE_DEBUG >= 2
1043 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1044 atomic_read(&ipv4_dst_ops.entries), goal, i);
1045#endif
1046
1047 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1048 goto out;
1049 } while (!in_softirq() && time_before_eq(jiffies, now));
1050
1051 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052 goto out;
1053 if (net_ratelimit())
1054 printk(KERN_WARNING "dst cache overflow\n");
1055 RT_CACHE_STAT_INC(gc_dst_overflow);
1056 return 1;
1057
1058work_done:
1059 expire += ip_rt_gc_min_interval;
1060 if (expire > ip_rt_gc_timeout ||
1061 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1062 expire = ip_rt_gc_timeout;
1063#if RT_CACHE_DEBUG >= 2
1064 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1065 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1066#endif
1067out: return 0;
1068}
1069
1070static int rt_intern_hash(unsigned hash, struct rtable *rt,
1071 struct rtable **rp, struct sk_buff *skb)
1072{
1073 struct rtable *rth, **rthp;
1074 unsigned long now;
1075 struct rtable *cand, **candp;
1076 u32 min_score;
1077 int chain_length;
1078 int attempts = !in_softirq();
1079
1080restart:
1081 chain_length = 0;
1082 min_score = ~(u32)0;
1083 cand = NULL;
1084 candp = NULL;
1085 now = jiffies;
1086
1087 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1105 int err = arp_bind_neighbour(&rt->u.dst);
1106 if (err) {
1107 if (net_ratelimit())
1108 printk(KERN_WARNING
1109 "Neighbour table failure & not caching routes.\n");
1110 rt_drop(rt);
1111 return err;
1112 }
1113 }
1114
1115 rt_free(rt);
1116 goto skip_hashing;
1117 }
1118
1119 rthp = &rt_hash_table[hash].chain;
1120
1121 spin_lock_bh(rt_hash_lock_addr(hash));
1122 while ((rth = *rthp) != NULL) {
1123 if (rt_is_expired(rth)) {
1124 *rthp = rth->u.dst.rt_next;
1125 rt_free(rth);
1126 continue;
1127 }
1128 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1129
1130 *rthp = rth->u.dst.rt_next;
1131
1132
1133
1134
1135
1136 rcu_assign_pointer(rth->u.dst.rt_next,
1137 rt_hash_table[hash].chain);
1138
1139
1140
1141
1142 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1143
1144 dst_use(&rth->u.dst, now);
1145 spin_unlock_bh(rt_hash_lock_addr(hash));
1146
1147 rt_drop(rt);
1148 if (rp)
1149 *rp = rth;
1150 else
1151 skb_dst_set(skb, &rth->u.dst);
1152 return 0;
1153 }
1154
1155 if (!atomic_read(&rth->u.dst.__refcnt)) {
1156 u32 score = rt_score(rth);
1157
1158 if (score <= min_score) {
1159 cand = rth;
1160 candp = rthp;
1161 min_score = score;
1162 }
1163 }
1164
1165 chain_length++;
1166
1167 rthp = &rth->u.dst.rt_next;
1168 }
1169
1170 if (cand) {
1171
1172
1173
1174
1175
1176
1177 if (chain_length > ip_rt_gc_elasticity) {
1178 *candp = cand->u.dst.rt_next;
1179 rt_free(cand);
1180 }
1181 } else {
1182 if (chain_length > rt_chain_length_max) {
1183 struct net *net = dev_net(rt->u.dst.dev);
1184 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1185 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1186 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1187 rt->u.dst.dev->name, num);
1188 }
1189 rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1190 }
1191 }
1192
1193
1194
1195
1196 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1197 int err = arp_bind_neighbour(&rt->u.dst);
1198 if (err) {
1199 spin_unlock_bh(rt_hash_lock_addr(hash));
1200
1201 if (err != -ENOBUFS) {
1202 rt_drop(rt);
1203 return err;
1204 }
1205
1206
1207
1208
1209
1210 if (attempts-- > 0) {
1211 int saved_elasticity = ip_rt_gc_elasticity;
1212 int saved_int = ip_rt_gc_min_interval;
1213 ip_rt_gc_elasticity = 1;
1214 ip_rt_gc_min_interval = 0;
1215 rt_garbage_collect(&ipv4_dst_ops);
1216 ip_rt_gc_min_interval = saved_int;
1217 ip_rt_gc_elasticity = saved_elasticity;
1218 goto restart;
1219 }
1220
1221 if (net_ratelimit())
1222 printk(KERN_WARNING "Neighbour table overflow.\n");
1223 rt_drop(rt);
1224 return -ENOBUFS;
1225 }
1226 }
1227
1228 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1229
1230#if RT_CACHE_DEBUG >= 2
1231 if (rt->u.dst.rt_next) {
1232 struct rtable *trt;
1233 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1234 hash, &rt->rt_dst);
1235 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1236 printk(" . %pI4", &trt->rt_dst);
1237 printk("\n");
1238 }
1239#endif
1240
1241
1242
1243
1244
1245 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1246
1247 spin_unlock_bh(rt_hash_lock_addr(hash));
1248
1249skip_hashing:
1250 if (rp)
1251 *rp = rt;
1252 else
1253 skb_dst_set(skb, &rt->u.dst);
1254 return 0;
1255}
1256
1257void rt_bind_peer(struct rtable *rt, int create)
1258{
1259 static DEFINE_SPINLOCK(rt_peer_lock);
1260 struct inet_peer *peer;
1261
1262 peer = inet_getpeer(rt->rt_dst, create);
1263
1264 spin_lock_bh(&rt_peer_lock);
1265 if (rt->peer == NULL) {
1266 rt->peer = peer;
1267 peer = NULL;
1268 }
1269 spin_unlock_bh(&rt_peer_lock);
1270 if (peer)
1271 inet_putpeer(peer);
1272}
1273
1274
1275
1276
1277
1278
1279
1280
1281static void ip_select_fb_ident(struct iphdr *iph)
1282{
1283 static DEFINE_SPINLOCK(ip_fb_id_lock);
1284 static u32 ip_fallback_id;
1285 u32 salt;
1286
1287 spin_lock_bh(&ip_fb_id_lock);
1288 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1289 iph->id = htons(salt & 0xFFFF);
1290 ip_fallback_id = salt;
1291 spin_unlock_bh(&ip_fb_id_lock);
1292}
1293
1294void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1295{
1296 struct rtable *rt = (struct rtable *) dst;
1297
1298 if (rt) {
1299 if (rt->peer == NULL)
1300 rt_bind_peer(rt, 1);
1301
1302
1303
1304
1305 if (rt->peer) {
1306 iph->id = htons(inet_getid(rt->peer, more));
1307 return;
1308 }
1309 } else
1310 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1311 __builtin_return_address(0));
1312
1313 ip_select_fb_ident(iph);
1314}
1315
1316static void rt_del(unsigned hash, struct rtable *rt)
1317{
1318 struct rtable **rthp, *aux;
1319
1320 rthp = &rt_hash_table[hash].chain;
1321 spin_lock_bh(rt_hash_lock_addr(hash));
1322 ip_rt_put(rt);
1323 while ((aux = *rthp) != NULL) {
1324 if (aux == rt || rt_is_expired(aux)) {
1325 *rthp = aux->u.dst.rt_next;
1326 rt_free(aux);
1327 continue;
1328 }
1329 rthp = &aux->u.dst.rt_next;
1330 }
1331 spin_unlock_bh(rt_hash_lock_addr(hash));
1332}
1333
1334void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1335 __be32 saddr, struct net_device *dev)
1336{
1337 int i, k;
1338 struct in_device *in_dev = in_dev_get(dev);
1339 struct rtable *rth, **rthp;
1340 __be32 skeys[2] = { saddr, 0 };
1341 int ikeys[2] = { dev->ifindex, 0 };
1342 struct netevent_redirect netevent;
1343 struct net *net;
1344
1345 if (!in_dev)
1346 return;
1347
1348 net = dev_net(dev);
1349 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1350 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1351 || ipv4_is_zeronet(new_gw))
1352 goto reject_redirect;
1353
1354 if (!rt_caching(net))
1355 goto reject_redirect;
1356
1357 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1358 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1359 goto reject_redirect;
1360 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1361 goto reject_redirect;
1362 } else {
1363 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1364 goto reject_redirect;
1365 }
1366
1367 for (i = 0; i < 2; i++) {
1368 for (k = 0; k < 2; k++) {
1369 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1370 rt_genid(net));
1371
1372 rthp=&rt_hash_table[hash].chain;
1373
1374 rcu_read_lock();
1375 while ((rth = rcu_dereference(*rthp)) != NULL) {
1376 struct rtable *rt;
1377
1378 if (rth->fl.fl4_dst != daddr ||
1379 rth->fl.fl4_src != skeys[i] ||
1380 rth->fl.oif != ikeys[k] ||
1381 rth->fl.iif != 0 ||
1382 rt_is_expired(rth) ||
1383 !net_eq(dev_net(rth->u.dst.dev), net)) {
1384 rthp = &rth->u.dst.rt_next;
1385 continue;
1386 }
1387
1388 if (rth->rt_dst != daddr ||
1389 rth->rt_src != saddr ||
1390 rth->u.dst.error ||
1391 rth->rt_gateway != old_gw ||
1392 rth->u.dst.dev != dev)
1393 break;
1394
1395 dst_hold(&rth->u.dst);
1396 rcu_read_unlock();
1397
1398 rt = dst_alloc(&ipv4_dst_ops);
1399 if (rt == NULL) {
1400 ip_rt_put(rth);
1401 in_dev_put(in_dev);
1402 return;
1403 }
1404
1405
1406 *rt = *rth;
1407 rt->u.dst.__use = 1;
1408 atomic_set(&rt->u.dst.__refcnt, 1);
1409 rt->u.dst.child = NULL;
1410 if (rt->u.dst.dev)
1411 dev_hold(rt->u.dst.dev);
1412 if (rt->idev)
1413 in_dev_hold(rt->idev);
1414 rt->u.dst.obsolete = 0;
1415 rt->u.dst.lastuse = jiffies;
1416 rt->u.dst.path = &rt->u.dst;
1417 rt->u.dst.neighbour = NULL;
1418 rt->u.dst.hh = NULL;
1419#ifdef CONFIG_XFRM
1420 rt->u.dst.xfrm = NULL;
1421#endif
1422 rt->rt_genid = rt_genid(net);
1423 rt->rt_flags |= RTCF_REDIRECTED;
1424
1425
1426 rt->rt_gateway = new_gw;
1427
1428
1429 dst_confirm(&rth->u.dst);
1430
1431 if (rt->peer)
1432 atomic_inc(&rt->peer->refcnt);
1433
1434 if (arp_bind_neighbour(&rt->u.dst) ||
1435 !(rt->u.dst.neighbour->nud_state &
1436 NUD_VALID)) {
1437 if (rt->u.dst.neighbour)
1438 neigh_event_send(rt->u.dst.neighbour, NULL);
1439 ip_rt_put(rth);
1440 rt_drop(rt);
1441 goto do_next;
1442 }
1443
1444 netevent.old = &rth->u.dst;
1445 netevent.new = &rt->u.dst;
1446 call_netevent_notifiers(NETEVENT_REDIRECT,
1447 &netevent);
1448
1449 rt_del(hash, rth);
1450 if (!rt_intern_hash(hash, rt, &rt, NULL))
1451 ip_rt_put(rt);
1452 goto do_next;
1453 }
1454 rcu_read_unlock();
1455 do_next:
1456 ;
1457 }
1458 }
1459 in_dev_put(in_dev);
1460 return;
1461
1462reject_redirect:
1463#ifdef CONFIG_IP_ROUTE_VERBOSE
1464 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1465 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1466 " Advised path = %pI4 -> %pI4\n",
1467 &old_gw, dev->name, &new_gw,
1468 &saddr, &daddr);
1469#endif
1470 in_dev_put(in_dev);
1471}
1472
1473static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1474{
1475 struct rtable *rt = (struct rtable *)dst;
1476 struct dst_entry *ret = dst;
1477
1478 if (rt) {
1479 if (dst->obsolete) {
1480 ip_rt_put(rt);
1481 ret = NULL;
1482 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1483 rt->u.dst.expires) {
1484 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1485 rt->fl.oif,
1486 rt_genid(dev_net(dst->dev)));
1487#if RT_CACHE_DEBUG >= 1
1488 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1489 &rt->rt_dst, rt->fl.fl4_tos);
1490#endif
1491 rt_del(hash, rt);
1492 ret = NULL;
1493 }
1494 }
1495 return ret;
1496}
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514void ip_rt_send_redirect(struct sk_buff *skb)
1515{
1516 struct rtable *rt = skb_rtable(skb);
1517 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1518
1519 if (!in_dev)
1520 return;
1521
1522 if (!IN_DEV_TX_REDIRECTS(in_dev))
1523 goto out;
1524
1525
1526
1527
1528 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1529 rt->u.dst.rate_tokens = 0;
1530
1531
1532
1533
1534 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1535 rt->u.dst.rate_last = jiffies;
1536 goto out;
1537 }
1538
1539
1540
1541
1542 if (rt->u.dst.rate_tokens == 0 ||
1543 time_after(jiffies,
1544 (rt->u.dst.rate_last +
1545 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1546 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1547 rt->u.dst.rate_last = jiffies;
1548 ++rt->u.dst.rate_tokens;
1549#ifdef CONFIG_IP_ROUTE_VERBOSE
1550 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1551 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1552 net_ratelimit())
1553 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1554 &rt->rt_src, rt->rt_iif,
1555 &rt->rt_dst, &rt->rt_gateway);
1556#endif
1557 }
1558out:
1559 in_dev_put(in_dev);
1560}
1561
1562static int ip_error(struct sk_buff *skb)
1563{
1564 struct rtable *rt = skb_rtable(skb);
1565 unsigned long now;
1566 int code;
1567
1568 switch (rt->u.dst.error) {
1569 case EINVAL:
1570 default:
1571 goto out;
1572 case EHOSTUNREACH:
1573 code = ICMP_HOST_UNREACH;
1574 break;
1575 case ENETUNREACH:
1576 code = ICMP_NET_UNREACH;
1577 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1578 IPSTATS_MIB_INNOROUTES);
1579 break;
1580 case EACCES:
1581 code = ICMP_PKT_FILTERED;
1582 break;
1583 }
1584
1585 now = jiffies;
1586 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1587 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1588 rt->u.dst.rate_tokens = ip_rt_error_burst;
1589 rt->u.dst.rate_last = now;
1590 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1591 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1592 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1593 }
1594
1595out: kfree_skb(skb);
1596 return 0;
1597}
1598
1599
1600
1601
1602
1603
/*
 * Candidate path-MTU values in strictly descending order.  guess_mtu()
 * walks this table when a "fragmentation needed" report carries no
 * usable next-hop MTU.
 */
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest table entry that is strictly smaller than
 * @old_mtu, or 68 when @old_mtu is not above the smallest entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *const end =
		mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (plateau != end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1616
1617unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1618 unsigned short new_mtu,
1619 struct net_device *dev)
1620{
1621 int i, k;
1622 unsigned short old_mtu = ntohs(iph->tot_len);
1623 struct rtable *rth;
1624 int ikeys[2] = { dev->ifindex, 0 };
1625 __be32 skeys[2] = { iph->saddr, 0, };
1626 __be32 daddr = iph->daddr;
1627 unsigned short est_mtu = 0;
1628
1629 if (ipv4_config.no_pmtu_disc)
1630 return 0;
1631
1632 for (k = 0; k < 2; k++) {
1633 for (i = 0; i < 2; i++) {
1634 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1635 rt_genid(net));
1636
1637 rcu_read_lock();
1638 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1639 rth = rcu_dereference(rth->u.dst.rt_next)) {
1640 unsigned short mtu = new_mtu;
1641
1642 if (rth->fl.fl4_dst != daddr ||
1643 rth->fl.fl4_src != skeys[i] ||
1644 rth->rt_dst != daddr ||
1645 rth->rt_src != iph->saddr ||
1646 rth->fl.oif != ikeys[k] ||
1647 rth->fl.iif != 0 ||
1648 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1649 !net_eq(dev_net(rth->u.dst.dev), net) ||
1650 rt_is_expired(rth))
1651 continue;
1652
1653 if (new_mtu < 68 || new_mtu >= old_mtu) {
1654
1655
1656 if (mtu == 0 &&
1657 old_mtu >= dst_mtu(&rth->u.dst) &&
1658 old_mtu >= 68 + (iph->ihl << 2))
1659 old_mtu -= iph->ihl << 2;
1660
1661 mtu = guess_mtu(old_mtu);
1662 }
1663 if (mtu <= dst_mtu(&rth->u.dst)) {
1664 if (mtu < dst_mtu(&rth->u.dst)) {
1665 dst_confirm(&rth->u.dst);
1666 if (mtu < ip_rt_min_pmtu) {
1667 mtu = ip_rt_min_pmtu;
1668 rth->u.dst.metrics[RTAX_LOCK-1] |=
1669 (1 << RTAX_MTU);
1670 }
1671 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1672 dst_set_expires(&rth->u.dst,
1673 ip_rt_mtu_expires);
1674 }
1675 est_mtu = mtu;
1676 }
1677 }
1678 rcu_read_unlock();
1679 }
1680 }
1681 return est_mtu ? : new_mtu;
1682}
1683
1684static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1685{
1686 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1687 !(dst_metric_locked(dst, RTAX_MTU))) {
1688 if (mtu < ip_rt_min_pmtu) {
1689 mtu = ip_rt_min_pmtu;
1690 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1691 }
1692 dst->metrics[RTAX_MTU-1] = mtu;
1693 dst_set_expires(dst, ip_rt_mtu_expires);
1694 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1695 }
1696}
1697
1698static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1699{
1700 return NULL;
1701}
1702
1703static void ipv4_dst_destroy(struct dst_entry *dst)
1704{
1705 struct rtable *rt = (struct rtable *) dst;
1706 struct inet_peer *peer = rt->peer;
1707 struct in_device *idev = rt->idev;
1708
1709 if (peer) {
1710 rt->peer = NULL;
1711 inet_putpeer(peer);
1712 }
1713
1714 if (idev) {
1715 rt->idev = NULL;
1716 in_dev_put(idev);
1717 }
1718}
1719
1720static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1721 int how)
1722{
1723 struct rtable *rt = (struct rtable *) dst;
1724 struct in_device *idev = rt->idev;
1725 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1726 struct in_device *loopback_idev =
1727 in_dev_get(dev_net(dev)->loopback_dev);
1728 if (loopback_idev) {
1729 rt->idev = loopback_idev;
1730 in_dev_put(idev);
1731 }
1732 }
1733}
1734
1735static void ipv4_link_failure(struct sk_buff *skb)
1736{
1737 struct rtable *rt;
1738
1739 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1740
1741 rt = skb_rtable(skb);
1742 if (rt)
1743 dst_set_expires(&rt->u.dst, 0);
1744}
1745
1746static int ip_rt_bug(struct sk_buff *skb)
1747{
1748 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1749 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1750 skb->dev ? skb->dev->name : "?");
1751 kfree_skb(skb);
1752 return 0;
1753}
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764void ip_rt_get_source(u8 *addr, struct rtable *rt)
1765{
1766 __be32 src;
1767 struct fib_result res;
1768
1769 if (rt->fl.iif == 0)
1770 src = rt->rt_src;
1771 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1772 src = FIB_RES_PREFSRC(res);
1773 fib_res_put(&res);
1774 } else
1775 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1776 RT_SCOPE_UNIVERSE);
1777 memcpy(addr, &src, 4);
1778}
1779
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's classifier id, one 16-bit half at a time:
 * a half of tclassid is filled from @tag only when that half is still
 * zero.  Halves that are already set are never overwritten.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	__u32 *classid = &rt->u.dst.tclassid;

	if ((*classid & 0xFFFF) == 0)
		*classid |= tag & 0xFFFF;
	if ((*classid & 0xFFFF0000) == 0)
		*classid |= tag & 0xFFFF0000;
}
#endif
1789
1790static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1791{
1792 struct fib_info *fi = res->fi;
1793
1794 if (fi) {
1795 if (FIB_RES_GW(*res) &&
1796 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1797 rt->rt_gateway = FIB_RES_GW(*res);
1798 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1799 sizeof(rt->u.dst.metrics));
1800 if (fi->fib_mtu == 0) {
1801 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1802 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1803 rt->rt_gateway != rt->rt_dst &&
1804 rt->u.dst.dev->mtu > 576)
1805 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1806 }
1807#ifdef CONFIG_NET_CLS_ROUTE
1808 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1809#endif
1810 } else
1811 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1812
1813 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1814 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1815 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1816 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1817 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1818 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1819 ip_rt_min_advmss);
1820 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1821 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1822
1823#ifdef CONFIG_NET_CLS_ROUTE
1824#ifdef CONFIG_IP_MULTIPLE_TABLES
1825 set_class_tag(rt, fib_rules_tclass(res));
1826#endif
1827 set_class_tag(rt, itag);
1828#endif
1829 rt->rt_type = res->type;
1830}
1831
1832static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1833 u8 tos, struct net_device *dev, int our)
1834{
1835 unsigned hash;
1836 struct rtable *rth;
1837 __be32 spec_dst;
1838 struct in_device *in_dev = in_dev_get(dev);
1839 u32 itag = 0;
1840
1841
1842
1843 if (in_dev == NULL)
1844 return -EINVAL;
1845
1846 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1847 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1848 goto e_inval;
1849
1850 if (ipv4_is_zeronet(saddr)) {
1851 if (!ipv4_is_local_multicast(daddr))
1852 goto e_inval;
1853 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1854 } else if (fib_validate_source(saddr, 0, tos, 0,
1855 dev, &spec_dst, &itag) < 0)
1856 goto e_inval;
1857
1858 rth = dst_alloc(&ipv4_dst_ops);
1859 if (!rth)
1860 goto e_nobufs;
1861
1862 rth->u.dst.output= ip_rt_bug;
1863
1864 atomic_set(&rth->u.dst.__refcnt, 1);
1865 rth->u.dst.flags= DST_HOST;
1866 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1867 rth->u.dst.flags |= DST_NOPOLICY;
1868 rth->fl.fl4_dst = daddr;
1869 rth->rt_dst = daddr;
1870 rth->fl.fl4_tos = tos;
1871 rth->fl.mark = skb->mark;
1872 rth->fl.fl4_src = saddr;
1873 rth->rt_src = saddr;
1874#ifdef CONFIG_NET_CLS_ROUTE
1875 rth->u.dst.tclassid = itag;
1876#endif
1877 rth->rt_iif =
1878 rth->fl.iif = dev->ifindex;
1879 rth->u.dst.dev = init_net.loopback_dev;
1880 dev_hold(rth->u.dst.dev);
1881 rth->idev = in_dev_get(rth->u.dst.dev);
1882 rth->fl.oif = 0;
1883 rth->rt_gateway = daddr;
1884 rth->rt_spec_dst= spec_dst;
1885 rth->rt_genid = rt_genid(dev_net(dev));
1886 rth->rt_flags = RTCF_MULTICAST;
1887 rth->rt_type = RTN_MULTICAST;
1888 if (our) {
1889 rth->u.dst.input= ip_local_deliver;
1890 rth->rt_flags |= RTCF_LOCAL;
1891 }
1892
1893#ifdef CONFIG_IP_MROUTE
1894 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1895 rth->u.dst.input = ip_mr_input;
1896#endif
1897 RT_CACHE_STAT_INC(in_slow_mc);
1898
1899 in_dev_put(in_dev);
1900 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1901 return rt_intern_hash(hash, rth, NULL, skb);
1902
1903e_nobufs:
1904 in_dev_put(in_dev);
1905 return -ENOBUFS;
1906
1907e_inval:
1908 in_dev_put(in_dev);
1909 return -EINVAL;
1910}
1911
1912
1913static void ip_handle_martian_source(struct net_device *dev,
1914 struct in_device *in_dev,
1915 struct sk_buff *skb,
1916 __be32 daddr,
1917 __be32 saddr)
1918{
1919 RT_CACHE_STAT_INC(in_martian_src);
1920#ifdef CONFIG_IP_ROUTE_VERBOSE
1921 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1922
1923
1924
1925
1926 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1927 &daddr, &saddr, dev->name);
1928 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1929 int i;
1930 const unsigned char *p = skb_mac_header(skb);
1931 printk(KERN_WARNING "ll header: ");
1932 for (i = 0; i < dev->hard_header_len; i++, p++) {
1933 printk("%02x", *p);
1934 if (i < (dev->hard_header_len - 1))
1935 printk(":");
1936 }
1937 printk("\n");
1938 }
1939 }
1940#endif
1941}
1942
1943static int __mkroute_input(struct sk_buff *skb,
1944 struct fib_result *res,
1945 struct in_device *in_dev,
1946 __be32 daddr, __be32 saddr, u32 tos,
1947 struct rtable **result)
1948{
1949
1950 struct rtable *rth;
1951 int err;
1952 struct in_device *out_dev;
1953 unsigned flags = 0;
1954 __be32 spec_dst;
1955 u32 itag;
1956
1957
1958 out_dev = in_dev_get(FIB_RES_DEV(*res));
1959 if (out_dev == NULL) {
1960 if (net_ratelimit())
1961 printk(KERN_CRIT "Bug in ip_route_input" \
1962 "_slow(). Please, report\n");
1963 return -EINVAL;
1964 }
1965
1966
1967 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1968 in_dev->dev, &spec_dst, &itag);
1969 if (err < 0) {
1970 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1971 saddr);
1972
1973 err = -EINVAL;
1974 goto cleanup;
1975 }
1976
1977 if (err)
1978 flags |= RTCF_DIRECTSRC;
1979
1980 if (out_dev == in_dev && err &&
1981 (IN_DEV_SHARED_MEDIA(out_dev) ||
1982 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1983 flags |= RTCF_DOREDIRECT;
1984
1985 if (skb->protocol != htons(ETH_P_IP)) {
1986
1987
1988
1989 if (out_dev == in_dev) {
1990 err = -EINVAL;
1991 goto cleanup;
1992 }
1993 }
1994
1995
1996 rth = dst_alloc(&ipv4_dst_ops);
1997 if (!rth) {
1998 err = -ENOBUFS;
1999 goto cleanup;
2000 }
2001
2002 atomic_set(&rth->u.dst.__refcnt, 1);
2003 rth->u.dst.flags= DST_HOST;
2004 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2005 rth->u.dst.flags |= DST_NOPOLICY;
2006 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2007 rth->u.dst.flags |= DST_NOXFRM;
2008 rth->fl.fl4_dst = daddr;
2009 rth->rt_dst = daddr;
2010 rth->fl.fl4_tos = tos;
2011 rth->fl.mark = skb->mark;
2012 rth->fl.fl4_src = saddr;
2013 rth->rt_src = saddr;
2014 rth->rt_gateway = daddr;
2015 rth->rt_iif =
2016 rth->fl.iif = in_dev->dev->ifindex;
2017 rth->u.dst.dev = (out_dev)->dev;
2018 dev_hold(rth->u.dst.dev);
2019 rth->idev = in_dev_get(rth->u.dst.dev);
2020 rth->fl.oif = 0;
2021 rth->rt_spec_dst= spec_dst;
2022
2023 rth->u.dst.input = ip_forward;
2024 rth->u.dst.output = ip_output;
2025 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2026
2027 rt_set_nexthop(rth, res, itag);
2028
2029 rth->rt_flags = flags;
2030
2031 *result = rth;
2032 err = 0;
2033 cleanup:
2034
2035 in_dev_put(out_dev);
2036 return err;
2037}
2038
2039static int ip_mkroute_input(struct sk_buff *skb,
2040 struct fib_result *res,
2041 const struct flowi *fl,
2042 struct in_device *in_dev,
2043 __be32 daddr, __be32 saddr, u32 tos)
2044{
2045 struct rtable* rth = NULL;
2046 int err;
2047 unsigned hash;
2048
2049#ifdef CONFIG_IP_ROUTE_MULTIPATH
2050 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2051 fib_select_multipath(fl, res);
2052#endif
2053
2054
2055 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2056 if (err)
2057 return err;
2058
2059
2060 hash = rt_hash(daddr, saddr, fl->iif,
2061 rt_genid(dev_net(rth->u.dst.dev)));
2062 return rt_intern_hash(hash, rth, NULL, skb);
2063}
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2076 u8 tos, struct net_device *dev)
2077{
2078 struct fib_result res;
2079 struct in_device *in_dev = in_dev_get(dev);
2080 struct flowi fl = { .nl_u = { .ip4_u =
2081 { .daddr = daddr,
2082 .saddr = saddr,
2083 .tos = tos,
2084 .scope = RT_SCOPE_UNIVERSE,
2085 } },
2086 .mark = skb->mark,
2087 .iif = dev->ifindex };
2088 unsigned flags = 0;
2089 u32 itag = 0;
2090 struct rtable * rth;
2091 unsigned hash;
2092 __be32 spec_dst;
2093 int err = -EINVAL;
2094 int free_res = 0;
2095 struct net * net = dev_net(dev);
2096
2097
2098
2099 if (!in_dev)
2100 goto out;
2101
2102
2103
2104
2105
2106 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2107 ipv4_is_loopback(saddr))
2108 goto martian_source;
2109
2110 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2111 goto brd_input;
2112
2113
2114
2115
2116 if (ipv4_is_zeronet(saddr))
2117 goto martian_source;
2118
2119 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2120 ipv4_is_loopback(daddr))
2121 goto martian_destination;
2122
2123
2124
2125
2126 if ((err = fib_lookup(net, &fl, &res)) != 0) {
2127 if (!IN_DEV_FORWARD(in_dev))
2128 goto e_hostunreach;
2129 goto no_route;
2130 }
2131 free_res = 1;
2132
2133 RT_CACHE_STAT_INC(in_slow_tot);
2134
2135 if (res.type == RTN_BROADCAST)
2136 goto brd_input;
2137
2138 if (res.type == RTN_LOCAL) {
2139 int result;
2140 result = fib_validate_source(saddr, daddr, tos,
2141 net->loopback_dev->ifindex,
2142 dev, &spec_dst, &itag);
2143 if (result < 0)
2144 goto martian_source;
2145 if (result)
2146 flags |= RTCF_DIRECTSRC;
2147 spec_dst = daddr;
2148 goto local_input;
2149 }
2150
2151 if (!IN_DEV_FORWARD(in_dev))
2152 goto e_hostunreach;
2153 if (res.type != RTN_UNICAST)
2154 goto martian_destination;
2155
2156 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2157done:
2158 in_dev_put(in_dev);
2159 if (free_res)
2160 fib_res_put(&res);
2161out: return err;
2162
2163brd_input:
2164 if (skb->protocol != htons(ETH_P_IP))
2165 goto e_inval;
2166
2167 if (ipv4_is_zeronet(saddr))
2168 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2169 else {
2170 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2171 &itag);
2172 if (err < 0)
2173 goto martian_source;
2174 if (err)
2175 flags |= RTCF_DIRECTSRC;
2176 }
2177 flags |= RTCF_BROADCAST;
2178 res.type = RTN_BROADCAST;
2179 RT_CACHE_STAT_INC(in_brd);
2180
2181local_input:
2182 rth = dst_alloc(&ipv4_dst_ops);
2183 if (!rth)
2184 goto e_nobufs;
2185
2186 rth->u.dst.output= ip_rt_bug;
2187 rth->rt_genid = rt_genid(net);
2188
2189 atomic_set(&rth->u.dst.__refcnt, 1);
2190 rth->u.dst.flags= DST_HOST;
2191 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2192 rth->u.dst.flags |= DST_NOPOLICY;
2193 rth->fl.fl4_dst = daddr;
2194 rth->rt_dst = daddr;
2195 rth->fl.fl4_tos = tos;
2196 rth->fl.mark = skb->mark;
2197 rth->fl.fl4_src = saddr;
2198 rth->rt_src = saddr;
2199#ifdef CONFIG_NET_CLS_ROUTE
2200 rth->u.dst.tclassid = itag;
2201#endif
2202 rth->rt_iif =
2203 rth->fl.iif = dev->ifindex;
2204 rth->u.dst.dev = net->loopback_dev;
2205 dev_hold(rth->u.dst.dev);
2206 rth->idev = in_dev_get(rth->u.dst.dev);
2207 rth->rt_gateway = daddr;
2208 rth->rt_spec_dst= spec_dst;
2209 rth->u.dst.input= ip_local_deliver;
2210 rth->rt_flags = flags|RTCF_LOCAL;
2211 if (res.type == RTN_UNREACHABLE) {
2212 rth->u.dst.input= ip_error;
2213 rth->u.dst.error= -err;
2214 rth->rt_flags &= ~RTCF_LOCAL;
2215 }
2216 rth->rt_type = res.type;
2217 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2218 err = rt_intern_hash(hash, rth, NULL, skb);
2219 goto done;
2220
2221no_route:
2222 RT_CACHE_STAT_INC(in_no_route);
2223 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2224 res.type = RTN_UNREACHABLE;
2225 if (err == -ESRCH)
2226 err = -ENETUNREACH;
2227 goto local_input;
2228
2229
2230
2231
2232martian_destination:
2233 RT_CACHE_STAT_INC(in_martian_dst);
2234#ifdef CONFIG_IP_ROUTE_VERBOSE
2235 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2236 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2237 &daddr, &saddr, dev->name);
2238#endif
2239
2240e_hostunreach:
2241 err = -EHOSTUNREACH;
2242 goto done;
2243
2244e_inval:
2245 err = -EINVAL;
2246 goto done;
2247
2248e_nobufs:
2249 err = -ENOBUFS;
2250 goto done;
2251
2252martian_source:
2253 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2254 goto e_inval;
2255}
2256
2257int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2258 u8 tos, struct net_device *dev)
2259{
2260 struct rtable * rth;
2261 unsigned hash;
2262 int iif = dev->ifindex;
2263 struct net *net;
2264
2265 net = dev_net(dev);
2266
2267 if (!rt_caching(net))
2268 goto skip_cache;
2269
2270 tos &= IPTOS_RT_MASK;
2271 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2272
2273 rcu_read_lock();
2274 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2275 rth = rcu_dereference(rth->u.dst.rt_next)) {
2276 if (((rth->fl.fl4_dst ^ daddr) |
2277 (rth->fl.fl4_src ^ saddr) |
2278 (rth->fl.iif ^ iif) |
2279 rth->fl.oif |
2280 (rth->fl.fl4_tos ^ tos)) == 0 &&
2281 rth->fl.mark == skb->mark &&
2282 net_eq(dev_net(rth->u.dst.dev), net) &&
2283 !rt_is_expired(rth)) {
2284 dst_use(&rth->u.dst, jiffies);
2285 RT_CACHE_STAT_INC(in_hit);
2286 rcu_read_unlock();
2287 skb_dst_set(skb, &rth->u.dst);
2288 return 0;
2289 }
2290 RT_CACHE_STAT_INC(in_hlist_search);
2291 }
2292 rcu_read_unlock();
2293
2294skip_cache:
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306 if (ipv4_is_multicast(daddr)) {
2307 struct in_device *in_dev;
2308
2309 rcu_read_lock();
2310 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2311 int our = ip_check_mc(in_dev, daddr, saddr,
2312 ip_hdr(skb)->protocol);
2313 if (our
2314#ifdef CONFIG_IP_MROUTE
2315 || (!ipv4_is_local_multicast(daddr) &&
2316 IN_DEV_MFORWARD(in_dev))
2317#endif
2318 ) {
2319 rcu_read_unlock();
2320 return ip_route_input_mc(skb, daddr, saddr,
2321 tos, dev, our);
2322 }
2323 }
2324 rcu_read_unlock();
2325 return -EINVAL;
2326 }
2327 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2328}
2329
2330static int __mkroute_output(struct rtable **result,
2331 struct fib_result *res,
2332 const struct flowi *fl,
2333 const struct flowi *oldflp,
2334 struct net_device *dev_out,
2335 unsigned flags)
2336{
2337 struct rtable *rth;
2338 struct in_device *in_dev;
2339 u32 tos = RT_FL_TOS(oldflp);
2340 int err = 0;
2341
2342 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2343 return -EINVAL;
2344
2345 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2346 res->type = RTN_BROADCAST;
2347 else if (ipv4_is_multicast(fl->fl4_dst))
2348 res->type = RTN_MULTICAST;
2349 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2350 return -EINVAL;
2351
2352 if (dev_out->flags & IFF_LOOPBACK)
2353 flags |= RTCF_LOCAL;
2354
2355
2356 in_dev = in_dev_get(dev_out);
2357 if (!in_dev)
2358 return -EINVAL;
2359
2360 if (res->type == RTN_BROADCAST) {
2361 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2362 if (res->fi) {
2363 fib_info_put(res->fi);
2364 res->fi = NULL;
2365 }
2366 } else if (res->type == RTN_MULTICAST) {
2367 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2368 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2369 oldflp->proto))
2370 flags &= ~RTCF_LOCAL;
2371
2372
2373
2374
2375 if (res->fi && res->prefixlen < 4) {
2376 fib_info_put(res->fi);
2377 res->fi = NULL;
2378 }
2379 }
2380
2381
2382 rth = dst_alloc(&ipv4_dst_ops);
2383 if (!rth) {
2384 err = -ENOBUFS;
2385 goto cleanup;
2386 }
2387
2388 atomic_set(&rth->u.dst.__refcnt, 1);
2389 rth->u.dst.flags= DST_HOST;
2390 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2391 rth->u.dst.flags |= DST_NOXFRM;
2392 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2393 rth->u.dst.flags |= DST_NOPOLICY;
2394
2395 rth->fl.fl4_dst = oldflp->fl4_dst;
2396 rth->fl.fl4_tos = tos;
2397 rth->fl.fl4_src = oldflp->fl4_src;
2398 rth->fl.oif = oldflp->oif;
2399 rth->fl.mark = oldflp->mark;
2400 rth->rt_dst = fl->fl4_dst;
2401 rth->rt_src = fl->fl4_src;
2402 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2403
2404
2405 rth->u.dst.dev = dev_out;
2406 dev_hold(dev_out);
2407 rth->idev = in_dev_get(dev_out);
2408 rth->rt_gateway = fl->fl4_dst;
2409 rth->rt_spec_dst= fl->fl4_src;
2410
2411 rth->u.dst.output=ip_output;
2412 rth->rt_genid = rt_genid(dev_net(dev_out));
2413
2414 RT_CACHE_STAT_INC(out_slow_tot);
2415
2416 if (flags & RTCF_LOCAL) {
2417 rth->u.dst.input = ip_local_deliver;
2418 rth->rt_spec_dst = fl->fl4_dst;
2419 }
2420 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2421 rth->rt_spec_dst = fl->fl4_src;
2422 if (flags & RTCF_LOCAL &&
2423 !(dev_out->flags & IFF_LOOPBACK)) {
2424 rth->u.dst.output = ip_mc_output;
2425 RT_CACHE_STAT_INC(out_slow_mc);
2426 }
2427#ifdef CONFIG_IP_MROUTE
2428 if (res->type == RTN_MULTICAST) {
2429 if (IN_DEV_MFORWARD(in_dev) &&
2430 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2431 rth->u.dst.input = ip_mr_input;
2432 rth->u.dst.output = ip_mc_output;
2433 }
2434 }
2435#endif
2436 }
2437
2438 rt_set_nexthop(rth, res, 0);
2439
2440 rth->rt_flags = flags;
2441
2442 *result = rth;
2443 cleanup:
2444
2445 in_dev_put(in_dev);
2446
2447 return err;
2448}
2449
2450static int ip_mkroute_output(struct rtable **rp,
2451 struct fib_result *res,
2452 const struct flowi *fl,
2453 const struct flowi *oldflp,
2454 struct net_device *dev_out,
2455 unsigned flags)
2456{
2457 struct rtable *rth = NULL;
2458 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2459 unsigned hash;
2460 if (err == 0) {
2461 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2462 rt_genid(dev_net(dev_out)));
2463 err = rt_intern_hash(hash, rth, rp, NULL);
2464 }
2465
2466 return err;
2467}
2468
2469
2470
2471
2472
2473static int ip_route_output_slow(struct net *net, struct rtable **rp,
2474 const struct flowi *oldflp)
2475{
2476 u32 tos = RT_FL_TOS(oldflp);
2477 struct flowi fl = { .nl_u = { .ip4_u =
2478 { .daddr = oldflp->fl4_dst,
2479 .saddr = oldflp->fl4_src,
2480 .tos = tos & IPTOS_RT_MASK,
2481 .scope = ((tos & RTO_ONLINK) ?
2482 RT_SCOPE_LINK :
2483 RT_SCOPE_UNIVERSE),
2484 } },
2485 .mark = oldflp->mark,
2486 .iif = net->loopback_dev->ifindex,
2487 .oif = oldflp->oif };
2488 struct fib_result res;
2489 unsigned flags = 0;
2490 struct net_device *dev_out = NULL;
2491 int free_res = 0;
2492 int err;
2493
2494
2495 res.fi = NULL;
2496#ifdef CONFIG_IP_MULTIPLE_TABLES
2497 res.r = NULL;
2498#endif
2499
2500 if (oldflp->fl4_src) {
2501 err = -EINVAL;
2502 if (ipv4_is_multicast(oldflp->fl4_src) ||
2503 ipv4_is_lbcast(oldflp->fl4_src) ||
2504 ipv4_is_zeronet(oldflp->fl4_src))
2505 goto out;
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515 if (oldflp->oif == 0
2516 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2517 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2518
2519 dev_out = ip_dev_find(net, oldflp->fl4_src);
2520 if (dev_out == NULL)
2521 goto out;
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538 fl.oif = dev_out->ifindex;
2539 goto make_route;
2540 }
2541
2542 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2543
2544 dev_out = ip_dev_find(net, oldflp->fl4_src);
2545 if (dev_out == NULL)
2546 goto out;
2547 dev_put(dev_out);
2548 dev_out = NULL;
2549 }
2550 }
2551
2552
2553 if (oldflp->oif) {
2554 dev_out = dev_get_by_index(net, oldflp->oif);
2555 err = -ENODEV;
2556 if (dev_out == NULL)
2557 goto out;
2558
2559
2560 if (__in_dev_get_rtnl(dev_out) == NULL) {
2561 dev_put(dev_out);
2562 goto out;
2563 }
2564
2565 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2566 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2567 if (!fl.fl4_src)
2568 fl.fl4_src = inet_select_addr(dev_out, 0,
2569 RT_SCOPE_LINK);
2570 goto make_route;
2571 }
2572 if (!fl.fl4_src) {
2573 if (ipv4_is_multicast(oldflp->fl4_dst))
2574 fl.fl4_src = inet_select_addr(dev_out, 0,
2575 fl.fl4_scope);
2576 else if (!oldflp->fl4_dst)
2577 fl.fl4_src = inet_select_addr(dev_out, 0,
2578 RT_SCOPE_HOST);
2579 }
2580 }
2581
2582 if (!fl.fl4_dst) {
2583 fl.fl4_dst = fl.fl4_src;
2584 if (!fl.fl4_dst)
2585 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2586 if (dev_out)
2587 dev_put(dev_out);
2588 dev_out = net->loopback_dev;
2589 dev_hold(dev_out);
2590 fl.oif = net->loopback_dev->ifindex;
2591 res.type = RTN_LOCAL;
2592 flags |= RTCF_LOCAL;
2593 goto make_route;
2594 }
2595
2596 if (fib_lookup(net, &fl, &res)) {
2597 res.fi = NULL;
2598 if (oldflp->oif) {
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617 if (fl.fl4_src == 0)
2618 fl.fl4_src = inet_select_addr(dev_out, 0,
2619 RT_SCOPE_LINK);
2620 res.type = RTN_UNICAST;
2621 goto make_route;
2622 }
2623 if (dev_out)
2624 dev_put(dev_out);
2625 err = -ENETUNREACH;
2626 goto out;
2627 }
2628 free_res = 1;
2629
2630 if (res.type == RTN_LOCAL) {
2631 if (!fl.fl4_src)
2632 fl.fl4_src = fl.fl4_dst;
2633 if (dev_out)
2634 dev_put(dev_out);
2635 dev_out = net->loopback_dev;
2636 dev_hold(dev_out);
2637 fl.oif = dev_out->ifindex;
2638 if (res.fi)
2639 fib_info_put(res.fi);
2640 res.fi = NULL;
2641 flags |= RTCF_LOCAL;
2642 goto make_route;
2643 }
2644
2645#ifdef CONFIG_IP_ROUTE_MULTIPATH
2646 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2647 fib_select_multipath(&fl, &res);
2648 else
2649#endif
2650 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2651 fib_select_default(net, &fl, &res);
2652
2653 if (!fl.fl4_src)
2654 fl.fl4_src = FIB_RES_PREFSRC(res);
2655
2656 if (dev_out)
2657 dev_put(dev_out);
2658 dev_out = FIB_RES_DEV(res);
2659 dev_hold(dev_out);
2660 fl.oif = dev_out->ifindex;
2661
2662
2663make_route:
2664 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2665
2666
2667 if (free_res)
2668 fib_res_put(&res);
2669 if (dev_out)
2670 dev_put(dev_out);
2671out: return err;
2672}
2673
2674int __ip_route_output_key(struct net *net, struct rtable **rp,
2675 const struct flowi *flp)
2676{
2677 unsigned hash;
2678 struct rtable *rth;
2679
2680 if (!rt_caching(net))
2681 goto slow_output;
2682
2683 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2684
2685 rcu_read_lock_bh();
2686 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2687 rth = rcu_dereference(rth->u.dst.rt_next)) {
2688 if (rth->fl.fl4_dst == flp->fl4_dst &&
2689 rth->fl.fl4_src == flp->fl4_src &&
2690 rth->fl.iif == 0 &&
2691 rth->fl.oif == flp->oif &&
2692 rth->fl.mark == flp->mark &&
2693 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2694 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2695 net_eq(dev_net(rth->u.dst.dev), net) &&
2696 !rt_is_expired(rth)) {
2697 dst_use(&rth->u.dst, jiffies);
2698 RT_CACHE_STAT_INC(out_hit);
2699 rcu_read_unlock_bh();
2700 *rp = rth;
2701 return 0;
2702 }
2703 RT_CACHE_STAT_INC(out_hlist_search);
2704 }
2705 rcu_read_unlock_bh();
2706
2707slow_output:
2708 return ip_route_output_slow(net, rp, flp);
2709}
2710
2711EXPORT_SYMBOL_GPL(__ip_route_output_key);
2712
2713static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2714{
2715}
2716
2717static struct dst_ops ipv4_dst_blackhole_ops = {
2718 .family = AF_INET,
2719 .protocol = cpu_to_be16(ETH_P_IP),
2720 .destroy = ipv4_dst_destroy,
2721 .check = ipv4_dst_check,
2722 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2723 .entries = ATOMIC_INIT(0),
2724};
2725
2726
2727static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2728{
2729 struct rtable *ort = *rp;
2730 struct rtable *rt = (struct rtable *)
2731 dst_alloc(&ipv4_dst_blackhole_ops);
2732
2733 if (rt) {
2734 struct dst_entry *new = &rt->u.dst;
2735
2736 atomic_set(&new->__refcnt, 1);
2737 new->__use = 1;
2738 new->input = dst_discard;
2739 new->output = dst_discard;
2740 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2741
2742 new->dev = ort->u.dst.dev;
2743 if (new->dev)
2744 dev_hold(new->dev);
2745
2746 rt->fl = ort->fl;
2747
2748 rt->idev = ort->idev;
2749 if (rt->idev)
2750 in_dev_hold(rt->idev);
2751 rt->rt_genid = rt_genid(net);
2752 rt->rt_flags = ort->rt_flags;
2753 rt->rt_type = ort->rt_type;
2754 rt->rt_dst = ort->rt_dst;
2755 rt->rt_src = ort->rt_src;
2756 rt->rt_iif = ort->rt_iif;
2757 rt->rt_gateway = ort->rt_gateway;
2758 rt->rt_spec_dst = ort->rt_spec_dst;
2759 rt->peer = ort->peer;
2760 if (rt->peer)
2761 atomic_inc(&rt->peer->refcnt);
2762
2763 dst_free(new);
2764 }
2765
2766 dst_release(&(*rp)->u.dst);
2767 *rp = rt;
2768 return (rt ? 0 : -ENOMEM);
2769}
2770
2771int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2772 struct sock *sk, int flags)
2773{
2774 int err;
2775
2776 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2777 return err;
2778
2779 if (flp->proto) {
2780 if (!flp->fl4_src)
2781 flp->fl4_src = (*rp)->rt_src;
2782 if (!flp->fl4_dst)
2783 flp->fl4_dst = (*rp)->rt_dst;
2784 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2785 flags ? XFRM_LOOKUP_WAIT : 0);
2786 if (err == -EREMOTE)
2787 err = ipv4_dst_blackhole(net, rp, flp);
2788
2789 return err;
2790 }
2791
2792 return 0;
2793}
2794
2795EXPORT_SYMBOL_GPL(ip_route_output_flow);
2796
/*
 * Convenience wrapper around ip_route_output_flow() for callers without a
 * socket: passes sk == NULL and flags == 0.
 */
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
2801
/*
 * Append a netlink route message describing the route attached to @skb
 * (skb_rtable(@skb)) to @skb itself.
 *
 * @pid, @seq, @event and @flags are handed to nlmsg_put() for the message
 * header.  @nowait is forwarded to ipmr_get_route() and selects how its
 * failures are handled (see below).
 *
 * Returns the value of nlmsg_end() on success.  Returns -EMSGSIZE if the
 * header cannot be added or the function bails out through
 * nla_put_failure, in which case the partial message is cancelled.  May
 * also return 0 directly when ipmr_get_route() returns 0 and @nowait is
 * not set.
 *
 * NOTE(review): the NLA_PUT* macros presumably jump to nla_put_failure
 * when an attribute does not fit -- confirm against their definition.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* Fixed-size rtmsg header. */
	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Only the upper bits of rt_flags are reported; always mark as cloned. */
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	/* A source address is reported only if the flow key carries one. */
	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	/*
	 * Preferred source: rt_spec_dst for routes with an input interface,
	 * otherwise rt_src when it differs from the flow's source.
	 */
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	/* Values for the cacheinfo attribute emitted at the end. */
	error = rt->u.dst.error;
	/* Expiry is reported relative to the current jiffies, 0 if unset. */
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/*
		 * Forwarded (non link-local) multicast: let ipmr_get_route()
		 * describe the route instead of emitting RTA_IIF here.
		 */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					/* 0 is returned as is; errors cancel. */
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					/*
					 * Only a size failure aborts; any other
					 * value is reported via cacheinfo.
					 */
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			/* With CONFIG_IP_MROUTE this is the else branch above. */
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2896
2897static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2898{
2899 struct net *net = sock_net(in_skb->sk);
2900 struct rtmsg *rtm;
2901 struct nlattr *tb[RTA_MAX+1];
2902 struct rtable *rt = NULL;
2903 __be32 dst = 0;
2904 __be32 src = 0;
2905 u32 iif;
2906 int err;
2907 struct sk_buff *skb;
2908
2909 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2910 if (err < 0)
2911 goto errout;
2912
2913 rtm = nlmsg_data(nlh);
2914
2915 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2916 if (skb == NULL) {
2917 err = -ENOBUFS;
2918 goto errout;
2919 }
2920
2921
2922
2923
2924 skb_reset_mac_header(skb);
2925 skb_reset_network_header(skb);
2926
2927
2928 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2929 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2930
2931 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2932 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2933 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2934
2935 if (iif) {
2936 struct net_device *dev;
2937
2938 dev = __dev_get_by_index(net, iif);
2939 if (dev == NULL) {
2940 err = -ENODEV;
2941 goto errout_free;
2942 }
2943
2944 skb->protocol = htons(ETH_P_IP);
2945 skb->dev = dev;
2946 local_bh_disable();
2947 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2948 local_bh_enable();
2949
2950 rt = skb_rtable(skb);
2951 if (err == 0 && rt->u.dst.error)
2952 err = -rt->u.dst.error;
2953 } else {
2954 struct flowi fl = {
2955 .nl_u = {
2956 .ip4_u = {
2957 .daddr = dst,
2958 .saddr = src,
2959 .tos = rtm->rtm_tos,
2960 },
2961 },
2962 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2963 };
2964 err = ip_route_output_key(net, &rt, &fl);
2965 }
2966
2967 if (err)
2968 goto errout_free;
2969
2970 skb_dst_set(skb, &rt->u.dst);
2971 if (rtm->rtm_flags & RTM_F_NOTIFY)
2972 rt->rt_flags |= RTCF_NOTIFY;
2973
2974 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2975 RTM_NEWROUTE, 0, 0);
2976 if (err <= 0)
2977 goto errout_free;
2978
2979 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2980errout:
2981 return err;
2982
2983errout_free:
2984 kfree_skb(skb);
2985 goto errout;
2986}
2987
2988int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2989{
2990 struct rtable *rt;
2991 int h, s_h;
2992 int idx, s_idx;
2993 struct net *net;
2994
2995 net = sock_net(skb->sk);
2996
2997 s_h = cb->args[0];
2998 if (s_h < 0)
2999 s_h = 0;
3000 s_idx = idx = cb->args[1];
3001 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3002 if (!rt_hash_table[h].chain)
3003 continue;
3004 rcu_read_lock_bh();
3005 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3006 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3007 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3008 continue;
3009 if (rt_is_expired(rt))
3010 continue;
3011 skb_dst_set(skb, dst_clone(&rt->u.dst));
3012 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3013 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3014 1, NLM_F_MULTI) <= 0) {
3015 skb_dst_drop(skb);
3016 rcu_read_unlock_bh();
3017 goto done;
3018 }
3019 skb_dst_drop(skb);
3020 }
3021 rcu_read_unlock_bh();
3022 }
3023
3024done:
3025 cb->args[0] = h;
3026 cb->args[1] = idx;
3027 return skb->len;
3028}
3029
/*
 * Flush the route cache of the namespace owning @in_dev's device, using a
 * delay argument of 0.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3034
3035#ifdef CONFIG_SYSCTL
3036static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3037 struct file *filp, void __user *buffer,
3038 size_t *lenp, loff_t *ppos)
3039{
3040 if (write) {
3041 int flush_delay;
3042 ctl_table ctl;
3043 struct net *net;
3044
3045 memcpy(&ctl, __ctl, sizeof(ctl));
3046 ctl.data = &flush_delay;
3047 proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
3048
3049 net = (struct net *)__ctl->extra1;
3050 rt_cache_flush(net, flush_delay);
3051 return 0;
3052 }
3053
3054 return -EINVAL;
3055}
3056
3057static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3058 void __user *oldval,
3059 size_t __user *oldlenp,
3060 void __user *newval,
3061 size_t newlen)
3062{
3063 int delay;
3064 struct net *net;
3065 if (newlen != sizeof(int))
3066 return -EINVAL;
3067 if (get_user(delay, (int __user *)newval))
3068 return -EFAULT;
3069 net = (struct net *)table->extra1;
3070 rt_cache_flush(net, delay);
3071 return 0;
3072}
3073
3074static void rt_secret_reschedule(int old)
3075{
3076 struct net *net;
3077 int new = ip_rt_secret_interval;
3078 int diff = new - old;
3079
3080 if (!diff)
3081 return;
3082
3083 rtnl_lock();
3084 for_each_net(net) {
3085 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3086
3087 if (!new)
3088 continue;
3089
3090 if (deleted) {
3091 long time = net->ipv4.rt_secret_timer.expires - jiffies;
3092
3093 if (time <= 0 || (time += diff) <= 0)
3094 time = 0;
3095
3096 net->ipv4.rt_secret_timer.expires = time;
3097 } else
3098 net->ipv4.rt_secret_timer.expires = new;
3099
3100 net->ipv4.rt_secret_timer.expires += jiffies;
3101 add_timer(&net->ipv4.rt_secret_timer);
3102 }
3103 rtnl_unlock();
3104}
3105
3106static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3107 struct file *filp,
3108 void __user *buffer, size_t *lenp,
3109 loff_t *ppos)
3110{
3111 int old = ip_rt_secret_interval;
3112 int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3113
3114 rt_secret_reschedule(old);
3115
3116 return ret;
3117}
3118
3119static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3120 void __user *oldval,
3121 size_t __user *oldlenp,
3122 void __user *newval,
3123 size_t newlen)
3124{
3125 int old = ip_rt_secret_interval;
3126 int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3127
3128 rt_secret_reschedule(old);
3129
3130 return ret;
3131}
3132
3133static ctl_table ipv4_route_table[] = {
3134 {
3135 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
3136 .procname = "gc_thresh",
3137 .data = &ipv4_dst_ops.gc_thresh,
3138 .maxlen = sizeof(int),
3139 .mode = 0644,
3140 .proc_handler = proc_dointvec,
3141 },
3142 {
3143 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
3144 .procname = "max_size",
3145 .data = &ip_rt_max_size,
3146 .maxlen = sizeof(int),
3147 .mode = 0644,
3148 .proc_handler = proc_dointvec,
3149 },
3150 {
3151
3152
3153 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3154 .procname = "gc_min_interval",
3155 .data = &ip_rt_gc_min_interval,
3156 .maxlen = sizeof(int),
3157 .mode = 0644,
3158 .proc_handler = proc_dointvec_jiffies,
3159 .strategy = sysctl_jiffies,
3160 },
3161 {
3162 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3163 .procname = "gc_min_interval_ms",
3164 .data = &ip_rt_gc_min_interval,
3165 .maxlen = sizeof(int),
3166 .mode = 0644,
3167 .proc_handler = proc_dointvec_ms_jiffies,
3168 .strategy = sysctl_ms_jiffies,
3169 },
3170 {
3171 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
3172 .procname = "gc_timeout",
3173 .data = &ip_rt_gc_timeout,
3174 .maxlen = sizeof(int),
3175 .mode = 0644,
3176 .proc_handler = proc_dointvec_jiffies,
3177 .strategy = sysctl_jiffies,
3178 },
3179 {
3180 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
3181 .procname = "gc_interval",
3182 .data = &ip_rt_gc_interval,
3183 .maxlen = sizeof(int),
3184 .mode = 0644,
3185 .proc_handler = proc_dointvec_jiffies,
3186 .strategy = sysctl_jiffies,
3187 },
3188 {
3189 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
3190 .procname = "redirect_load",
3191 .data = &ip_rt_redirect_load,
3192 .maxlen = sizeof(int),
3193 .mode = 0644,
3194 .proc_handler = proc_dointvec,
3195 },
3196 {
3197 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3198 .procname = "redirect_number",
3199 .data = &ip_rt_redirect_number,
3200 .maxlen = sizeof(int),
3201 .mode = 0644,
3202 .proc_handler = proc_dointvec,
3203 },
3204 {
3205 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3206 .procname = "redirect_silence",
3207 .data = &ip_rt_redirect_silence,
3208 .maxlen = sizeof(int),
3209 .mode = 0644,
3210 .proc_handler = proc_dointvec,
3211 },
3212 {
3213 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
3214 .procname = "error_cost",
3215 .data = &ip_rt_error_cost,
3216 .maxlen = sizeof(int),
3217 .mode = 0644,
3218 .proc_handler = proc_dointvec,
3219 },
3220 {
3221 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3222 .procname = "error_burst",
3223 .data = &ip_rt_error_burst,
3224 .maxlen = sizeof(int),
3225 .mode = 0644,
3226 .proc_handler = proc_dointvec,
3227 },
3228 {
3229 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3230 .procname = "gc_elasticity",
3231 .data = &ip_rt_gc_elasticity,
3232 .maxlen = sizeof(int),
3233 .mode = 0644,
3234 .proc_handler = proc_dointvec,
3235 },
3236 {
3237 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3238 .procname = "mtu_expires",
3239 .data = &ip_rt_mtu_expires,
3240 .maxlen = sizeof(int),
3241 .mode = 0644,
3242 .proc_handler = proc_dointvec_jiffies,
3243 .strategy = sysctl_jiffies,
3244 },
3245 {
3246 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3247 .procname = "min_pmtu",
3248 .data = &ip_rt_min_pmtu,
3249 .maxlen = sizeof(int),
3250 .mode = 0644,
3251 .proc_handler = proc_dointvec,
3252 },
3253 {
3254 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3255 .procname = "min_adv_mss",
3256 .data = &ip_rt_min_advmss,
3257 .maxlen = sizeof(int),
3258 .mode = 0644,
3259 .proc_handler = proc_dointvec,
3260 },
3261 {
3262 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3263 .procname = "secret_interval",
3264 .data = &ip_rt_secret_interval,
3265 .maxlen = sizeof(int),
3266 .mode = 0644,
3267 .proc_handler = ipv4_sysctl_rt_secret_interval,
3268 .strategy = ipv4_sysctl_rt_secret_interval_strategy,
3269 },
3270 { .ctl_name = 0 }
3271};
3272
/* All-zero single-entry table, used as the child of "neigh" in ipv4_skeleton. */
static struct ctl_table empty[1];
3274
3275static struct ctl_table ipv4_skeleton[] =
3276{
3277 { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3278 .mode = 0555, .child = ipv4_route_table},
3279 { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3280 .mode = 0555, .child = empty},
3281 { }
3282};
3283
3284static __net_initdata struct ctl_path ipv4_path[] = {
3285 { .procname = "net", .ctl_name = CTL_NET, },
3286 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3287 { },
3288};
3289
3290static struct ctl_table ipv4_route_flush_table[] = {
3291 {
3292 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3293 .procname = "flush",
3294 .maxlen = sizeof(int),
3295 .mode = 0200,
3296 .proc_handler = ipv4_sysctl_rtcache_flush,
3297 .strategy = ipv4_sysctl_rtcache_flush_strategy,
3298 },
3299 { .ctl_name = 0 },
3300};
3301
3302static __net_initdata struct ctl_path ipv4_route_path[] = {
3303 { .procname = "net", .ctl_name = CTL_NET, },
3304 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3305 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3306 { },
3307};
3308
3309static __net_init int sysctl_route_net_init(struct net *net)
3310{
3311 struct ctl_table *tbl;
3312
3313 tbl = ipv4_route_flush_table;
3314 if (net != &init_net) {
3315 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3316 if (tbl == NULL)
3317 goto err_dup;
3318 }
3319 tbl[0].extra1 = net;
3320
3321 net->ipv4.route_hdr =
3322 register_net_sysctl_table(net, ipv4_route_path, tbl);
3323 if (net->ipv4.route_hdr == NULL)
3324 goto err_reg;
3325 return 0;
3326
3327err_reg:
3328 if (tbl != ipv4_route_flush_table)
3329 kfree(tbl);
3330err_dup:
3331 return -ENOMEM;
3332}
3333
3334static __net_exit void sysctl_route_net_exit(struct net *net)
3335{
3336 struct ctl_table *tbl;
3337
3338 tbl = net->ipv4.route_hdr->ctl_table_arg;
3339 unregister_net_sysctl_table(net->ipv4.route_hdr);
3340 BUG_ON(tbl == ipv4_route_flush_table);
3341 kfree(tbl);
3342}
3343
/* Per-namespace hooks that create and remove the route "flush" sysctl. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
3348#endif
3349
3350
3351static __net_init int rt_secret_timer_init(struct net *net)
3352{
3353 atomic_set(&net->ipv4.rt_genid,
3354 (int) ((num_physpages ^ (num_physpages>>8)) ^
3355 (jiffies ^ (jiffies >> 7))));
3356
3357 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3358 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3359 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3360
3361 if (ip_rt_secret_interval) {
3362 net->ipv4.rt_secret_timer.expires =
3363 jiffies + net_random() % ip_rt_secret_interval +
3364 ip_rt_secret_interval;
3365 add_timer(&net->ipv4.rt_secret_timer);
3366 }
3367 return 0;
3368}
3369
/* Per-namespace exit: delete @net's secret timer with del_timer_sync(). */
static __net_exit void rt_secret_timer_exit(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
}
3374
/* Per-namespace hooks that set up and tear down the secret timer. */
static __net_initdata struct pernet_operations rt_secret_timer_ops = {
	.init = rt_secret_timer_init,
	.exit = rt_secret_timer_exit,
};
3379
3380
3381#ifdef CONFIG_NET_CLS_ROUTE
/* Per-cpu array of 256 struct ip_rt_acct, allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
3383#endif
3384
3385static __initdata unsigned long rhash_entries;
3386static int __init set_rhash_entries(char *str)
3387{
3388 if (!str)
3389 return 0;
3390 rhash_entries = simple_strtoul(str, &str, 0);
3391 return 1;
3392}
3393__setup("rhash_entries=", set_rhash_entries);
3394
/*
 * Boot-time initialisation of the IPv4 routing cache and its helpers.
 *
 * Allocates the accounting array (CONFIG_NET_CLS_ROUTE), the dst slab cache
 * and the route hash table, derives the default gc threshold and maximum
 * cache size from the hash table size, initialises devinet and the FIB,
 * schedules the periodic expiry work, and registers the secret timer, proc
 * files, xfrm (CONFIG_XFRM), the RTM_GETROUTE handler and the per-namespace
 * sysctl (CONFIG_SYSCTL).
 *
 * Always returns 0: failures of the secret timer or proc registration are
 * only logged, and the result of the sysctl registration is not checked.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	/* NOTE(review): no NULL check; presumably SLAB_PANIC makes failure fatal. */
	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole routes are allocated from the same slab cache. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	/*
	 * rhash_entries comes from the "rhash_entries=" boot parameter.  The
	 * scale argument depends on the amount of physical memory, and the
	 * 512K limit is applied only when no boot parameter was given.
	 */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* Defaults: gc threshold = number of buckets, max size = 16x that. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/*
	 * Periodic expiry work, set up as deferrable.  The first run is
	 * scheduled between one and two gc intervals from now.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (register_pernet_subsys(&rt_secret_timer_ops))
		printk(KERN_ERR "Unable to setup rt_secret_timer\n");

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	return rc;
}
3454
3455#ifdef CONFIG_SYSCTL
3456
3457
3458
3459
/*
 * Register the static sysctl skeleton (ipv4_skeleton: "route" and "neigh")
 * under ipv4_path ("net/ipv4").  This is separate from ip_rt_init().
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
3464#endif
3465
/* Symbols exported to modules. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);
3469