1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64#include <linux/config.h>
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <asm/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
71#include <linux/sched.h>
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
83#include <linux/rtnetlink.h>
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
93#include <net/protocol.h>
94#include <net/ip.h>
95#include <net/route.h>
96#include <net/inetpeer.h>
97#include <net/sock.h>
98#include <net/ip_fib.h>
99#include <net/arp.h>
100#include <net/tcp.h>
101#include <net/icmp.h>
102#include <net/xfrm.h>
103#ifdef CONFIG_SYSCTL
104#include <linux/sysctl.h>
105#endif
106
107#define IP_MAX_MTU 0xFFF0
108
109#define RT_GC_TIMEOUT (300*HZ)
110
111int ip_rt_min_delay = 2 * HZ;
112int ip_rt_max_delay = 10 * HZ;
113int ip_rt_max_size;
114int ip_rt_gc_timeout = RT_GC_TIMEOUT;
115int ip_rt_gc_interval = 60 * HZ;
116int ip_rt_gc_min_interval = HZ / 2;
117int ip_rt_redirect_number = 9;
118int ip_rt_redirect_load = HZ / 50;
119int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
120int ip_rt_error_cost = HZ;
121int ip_rt_error_burst = 5 * HZ;
122int ip_rt_gc_elasticity = 8;
123int ip_rt_mtu_expires = 10 * 60 * HZ;
124int ip_rt_min_pmtu = 512 + 20 + 20;
125int ip_rt_min_advmss = 256;
126int ip_rt_secret_interval = 10 * 60 * HZ;
127static unsigned long rt_deadline;
128
129#define RTprint(a...) printk(KERN_DEBUG a)
130
131static struct timer_list rt_flush_timer;
132static struct timer_list rt_periodic_timer;
133static struct timer_list rt_secret_timer;
134
135
136
137
138
139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140static void ipv4_dst_destroy(struct dst_entry *dst);
141static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
142static void ipv4_link_failure(struct sk_buff *skb);
143static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
144static int rt_garbage_collect(void);
145
146
147struct dst_ops ipv4_dst_ops = {
148 .family = AF_INET,
149 .protocol = __constant_htons(ETH_P_IP),
150 .gc = rt_garbage_collect,
151 .check = ipv4_dst_check,
152 .destroy = ipv4_dst_destroy,
153 .negative_advice = ipv4_negative_advice,
154 .link_failure = ipv4_link_failure,
155 .update_pmtu = ip_rt_update_pmtu,
156 .entry_size = sizeof(struct rtable),
157};
158
159#define ECN_OR_COST(class) TC_PRIO_##class
160
161__u8 ip_tos2prio[16] = {
162 TC_PRIO_BESTEFFORT,
163 ECN_OR_COST(FILLER),
164 TC_PRIO_BESTEFFORT,
165 ECN_OR_COST(BESTEFFORT),
166 TC_PRIO_BULK,
167 ECN_OR_COST(BULK),
168 TC_PRIO_BULK,
169 ECN_OR_COST(BULK),
170 TC_PRIO_INTERACTIVE,
171 ECN_OR_COST(INTERACTIVE),
172 TC_PRIO_INTERACTIVE,
173 ECN_OR_COST(INTERACTIVE),
174 TC_PRIO_INTERACTIVE_BULK,
175 ECN_OR_COST(INTERACTIVE_BULK),
176 TC_PRIO_INTERACTIVE_BULK,
177 ECN_OR_COST(INTERACTIVE_BULK)
178};
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195struct rt_hash_bucket {
196 struct rtable *chain;
197 spinlock_t lock;
198} __attribute__((__aligned__(8)));
199
/* The hash table itself; indexed by rt_hash_code() results. */
static struct rt_hash_bucket	*rt_hash_table;
/* Mask ANDed onto hash values to obtain a bucket index. */
static unsigned			rt_hash_mask;
/* Shift count applied to per-bucket quantities in the GC code. */
static int			rt_hash_log;
/* Seed fed to jhash_3words(); replaced by rt_run_flush() on every flush. */
static unsigned int		rt_hash_rnd;

/* Statistics block, accessed per CPU through per_cpu_ptr(). */
struct rt_cache_stat *rt_cache_stat;

/* Defined further down; declared early for callers that precede it. */
static int rt_intern_hash(unsigned hash, struct rtable *rth,
			  struct rtable **res);
209
210static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
211{
212 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
213 & rt_hash_mask);
214}
215
216#ifdef CONFIG_PROC_FS
/* Per-open iterator state for the route cache seq_file walk. */
struct rt_cache_iter_state {
	int bucket;	/* bucket currently being walked; counts down to 0 */
};
220
221static struct rtable *rt_cache_get_first(struct seq_file *seq)
222{
223 struct rtable *r = NULL;
224 struct rt_cache_iter_state *st = seq->private;
225
226 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
227 rcu_read_lock();
228 r = rt_hash_table[st->bucket].chain;
229 if (r)
230 break;
231 rcu_read_unlock();
232 }
233 return r;
234}
235
236static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
237{
238 struct rt_cache_iter_state *st = seq->private;
239
240 smp_read_barrier_depends();
241 r = r->u.rt_next;
242 while (!r) {
243 rcu_read_unlock();
244 if (--st->bucket < 0)
245 break;
246 rcu_read_lock();
247 r = rt_hash_table[st->bucket].chain;
248 }
249 return r;
250}
251
252static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
253{
254 struct rtable *r = rt_cache_get_first(seq);
255
256 if (r)
257 while (pos && (r = rt_cache_get_next(seq, r)))
258 --pos;
259 return pos ? NULL : r;
260}
261
262static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
263{
264 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
265}
266
267static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268{
269 struct rtable *r = NULL;
270
271 if (v == SEQ_START_TOKEN)
272 r = rt_cache_get_first(seq);
273 else
274 r = rt_cache_get_next(seq, v);
275 ++*pos;
276 return r;
277}
278
279static void rt_cache_seq_stop(struct seq_file *seq, void *v)
280{
281 if (v && v != SEQ_START_TOKEN)
282 rcu_read_unlock();
283}
284
285static int rt_cache_seq_show(struct seq_file *seq, void *v)
286{
287 if (v == SEQ_START_TOKEN)
288 seq_printf(seq, "%-127s\n",
289 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
290 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
291 "HHUptod\tSpecDst");
292 else {
293 struct rtable *r = v;
294 char temp[256];
295
296 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
297 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
298 r->u.dst.dev ? r->u.dst.dev->name : "*",
299 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
300 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
301 r->u.dst.__use, 0, (unsigned long)r->rt_src,
302 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
303 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
304 dst_metric(&r->u.dst, RTAX_WINDOW),
305 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
306 dst_metric(&r->u.dst, RTAX_RTTVAR)),
307 r->fl.fl4_tos,
308 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
309 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
310 dev_queue_xmit) : 0,
311 r->rt_spec_dst);
312 seq_printf(seq, "%-127s\n", temp);
313 }
314 return 0;
315}
316
317static struct seq_operations rt_cache_seq_ops = {
318 .start = rt_cache_seq_start,
319 .next = rt_cache_seq_next,
320 .stop = rt_cache_seq_stop,
321 .show = rt_cache_seq_show,
322};
323
324static int rt_cache_seq_open(struct inode *inode, struct file *file)
325{
326 struct seq_file *seq;
327 int rc = -ENOMEM;
328 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
329
330 if (!s)
331 goto out;
332 rc = seq_open(file, &rt_cache_seq_ops);
333 if (rc)
334 goto out_kfree;
335 seq = file->private_data;
336 seq->private = s;
337 memset(s, 0, sizeof(*s));
338out:
339 return rc;
340out_kfree:
341 kfree(s);
342 goto out;
343}
344
345static struct file_operations rt_cache_seq_fops = {
346 .owner = THIS_MODULE,
347 .open = rt_cache_seq_open,
348 .read = seq_read,
349 .llseek = seq_lseek,
350 .release = seq_release_private,
351};
352
353
354static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
355{
356 int cpu;
357
358 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
359 if (!cpu_possible(cpu))
360 continue;
361 *pos = cpu;
362 return per_cpu_ptr(rt_cache_stat, cpu);
363 }
364 return NULL;
365}
366
367static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
368{
369 int cpu;
370
371 for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
372 if (!cpu_possible(cpu))
373 continue;
374 *pos = cpu;
375 return per_cpu_ptr(rt_cache_stat, cpu);
376 }
377 return NULL;
378
379}
380
/* seq_file ->stop for the per-CPU statistics: intentionally does nothing. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
385
386static int rt_cpu_seq_show(struct seq_file *seq, void *v)
387{
388 struct rt_cache_stat *st = v;
389
390 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
391 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
392 atomic_read(&ipv4_dst_ops.entries),
393 st->in_hit,
394 st->in_slow_tot,
395 st->in_slow_mc,
396 st->in_no_route,
397 st->in_brd,
398 st->in_martian_dst,
399 st->in_martian_src,
400
401 st->out_hit,
402 st->out_slow_tot,
403 st->out_slow_mc,
404
405 st->gc_total,
406 st->gc_ignored,
407 st->gc_goal_miss,
408 st->gc_dst_overflow,
409 st->in_hlist_search,
410 st->out_hlist_search
411 );
412 return 0;
413}
414
415static struct seq_operations rt_cpu_seq_ops = {
416 .start = rt_cpu_seq_start,
417 .next = rt_cpu_seq_next,
418 .stop = rt_cpu_seq_stop,
419 .show = rt_cpu_seq_show,
420};
421
422
423static int rt_cpu_seq_open(struct inode *inode, struct file *file)
424{
425 return seq_open(file, &rt_cpu_seq_ops);
426}
427
428static struct file_operations rt_cpu_seq_fops = {
429 .owner = THIS_MODULE,
430 .open = rt_cpu_seq_open,
431 .read = seq_read,
432 .llseek = seq_lseek,
433 .release = seq_release_private,
434};
435
436#endif
437
438static __inline__ void rt_free(struct rtable *rt)
439{
440 call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
441}
442
443static __inline__ void rt_drop(struct rtable *rt)
444{
445 ip_rt_put(rt);
446 call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
447}
448
449static __inline__ int rt_fast_clean(struct rtable *rth)
450{
451
452
453 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
454 rth->fl.iif && rth->u.rt_next;
455}
456
457static __inline__ int rt_valuable(struct rtable *rth)
458{
459 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
460 rth->u.dst.expires;
461}
462
463static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
464{
465 unsigned long age;
466 int ret = 0;
467
468 if (atomic_read(&rth->u.dst.__refcnt))
469 goto out;
470
471 ret = 1;
472 if (rth->u.dst.expires &&
473 time_after_eq(jiffies, rth->u.dst.expires))
474 goto out;
475
476 age = jiffies - rth->u.dst.lastuse;
477 ret = 0;
478 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
479 (age <= tmo2 && rt_valuable(rth)))
480 goto out;
481 ret = 1;
482out: return ret;
483}
484
485
486
487
488
489
490static inline u32 rt_score(struct rtable *rt)
491{
492 u32 score = jiffies - rt->u.dst.lastuse;
493
494 score = ~score & ~(3<<30);
495
496 if (rt_valuable(rt))
497 score |= (1<<31);
498
499 if (!rt->fl.iif ||
500 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
501 score |= (1<<30);
502
503 return score;
504}
505
506
507static void rt_check_expire(unsigned long dummy)
508{
509 static int rover;
510 int i = rover, t;
511 struct rtable *rth, **rthp;
512 unsigned long now = jiffies;
513
514 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
515 t -= ip_rt_gc_timeout) {
516 unsigned long tmo = ip_rt_gc_timeout;
517
518 i = (i + 1) & rt_hash_mask;
519 rthp = &rt_hash_table[i].chain;
520
521 spin_lock(&rt_hash_table[i].lock);
522 while ((rth = *rthp) != NULL) {
523 if (rth->u.dst.expires) {
524
525 if (time_before_eq(now, rth->u.dst.expires)) {
526 tmo >>= 1;
527 rthp = &rth->u.rt_next;
528 continue;
529 }
530 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
531 tmo >>= 1;
532 rthp = &rth->u.rt_next;
533 continue;
534 }
535
536
537 *rthp = rth->u.rt_next;
538 rt_free(rth);
539 }
540 spin_unlock(&rt_hash_table[i].lock);
541
542
543 if (time_after(jiffies, now))
544 break;
545 }
546 rover = i;
547 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
548}
549
550
551
552
553static void rt_run_flush(unsigned long dummy)
554{
555 int i;
556 struct rtable *rth, *next;
557
558 rt_deadline = 0;
559
560 get_random_bytes(&rt_hash_rnd, 4);
561
562 for (i = rt_hash_mask; i >= 0; i--) {
563 spin_lock_bh(&rt_hash_table[i].lock);
564 rth = rt_hash_table[i].chain;
565 if (rth)
566 rt_hash_table[i].chain = NULL;
567 spin_unlock_bh(&rt_hash_table[i].lock);
568
569 for (; rth; rth = next) {
570 next = rth->u.rt_next;
571 rt_free(rth);
572 }
573 }
574}
575
576static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
577
578void rt_cache_flush(int delay)
579{
580 unsigned long now = jiffies;
581 int user_mode = !in_softirq();
582
583 if (delay < 0)
584 delay = ip_rt_min_delay;
585
586 spin_lock_bh(&rt_flush_lock);
587
588 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
589 long tmo = (long)(rt_deadline - now);
590
591
592
593
594
595
596
597
598 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
599 tmo = 0;
600
601 if (delay > tmo)
602 delay = tmo;
603 }
604
605 if (delay <= 0) {
606 spin_unlock_bh(&rt_flush_lock);
607 rt_run_flush(0);
608 return;
609 }
610
611 if (rt_deadline == 0)
612 rt_deadline = now + ip_rt_max_delay;
613
614 mod_timer(&rt_flush_timer, now+delay);
615 spin_unlock_bh(&rt_flush_lock);
616}
617
618static void rt_secret_rebuild(unsigned long dummy)
619{
620 unsigned long now = jiffies;
621
622 rt_cache_flush(0);
623 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
624}
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639static int rt_garbage_collect(void)
640{
641 static unsigned long expire = RT_GC_TIMEOUT;
642 static unsigned long last_gc;
643 static int rover;
644 static int equilibrium;
645 struct rtable *rth, **rthp;
646 unsigned long now = jiffies;
647 int goal;
648
649
650
651
652
653
654 RT_CACHE_STAT_INC(gc_total);
655
656 if (now - last_gc < ip_rt_gc_min_interval &&
657 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
658 RT_CACHE_STAT_INC(gc_ignored);
659 goto out;
660 }
661
662
663 goal = atomic_read(&ipv4_dst_ops.entries) -
664 (ip_rt_gc_elasticity << rt_hash_log);
665 if (goal <= 0) {
666 if (equilibrium < ipv4_dst_ops.gc_thresh)
667 equilibrium = ipv4_dst_ops.gc_thresh;
668 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
669 if (goal > 0) {
670 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
671 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
672 }
673 } else {
674
675
676
677 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
678 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
679 }
680
681 if (now - last_gc >= ip_rt_gc_min_interval)
682 last_gc = now;
683
684 if (goal <= 0) {
685 equilibrium += goal;
686 goto work_done;
687 }
688
689 do {
690 int i, k;
691
692 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
693 unsigned long tmo = expire;
694
695 k = (k + 1) & rt_hash_mask;
696 rthp = &rt_hash_table[k].chain;
697 spin_lock_bh(&rt_hash_table[k].lock);
698 while ((rth = *rthp) != NULL) {
699 if (!rt_may_expire(rth, tmo, expire)) {
700 tmo >>= 1;
701 rthp = &rth->u.rt_next;
702 continue;
703 }
704 *rthp = rth->u.rt_next;
705 rt_free(rth);
706 goal--;
707 }
708 spin_unlock_bh(&rt_hash_table[k].lock);
709 if (goal <= 0)
710 break;
711 }
712 rover = k;
713
714 if (goal <= 0)
715 goto work_done;
716
717
718
719
720
721
722
723
724
725
726 RT_CACHE_STAT_INC(gc_goal_miss);
727
728 if (expire == 0)
729 break;
730
731 expire >>= 1;
732#if RT_CACHE_DEBUG >= 2
733 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
734 atomic_read(&ipv4_dst_ops.entries), goal, i);
735#endif
736
737 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
738 goto out;
739 } while (!in_softirq() && time_before_eq(jiffies, now));
740
741 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
742 goto out;
743 if (net_ratelimit())
744 printk(KERN_WARNING "dst cache overflow\n");
745 RT_CACHE_STAT_INC(gc_dst_overflow);
746 return 1;
747
748work_done:
749 expire += ip_rt_gc_min_interval;
750 if (expire > ip_rt_gc_timeout ||
751 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
752 expire = ip_rt_gc_timeout;
753#if RT_CACHE_DEBUG >= 2
754 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
755 atomic_read(&ipv4_dst_ops.entries), goal, rover);
756#endif
757out: return 0;
758}
759
760static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
761{
762 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
763 fl1->oif == fl2->oif &&
764 fl1->iif == fl2->iif;
765}
766
767static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
768{
769 struct rtable *rth, **rthp;
770 unsigned long now;
771 struct rtable *cand, **candp;
772 u32 min_score;
773 int chain_length;
774 int attempts = !in_softirq();
775
776restart:
777 chain_length = 0;
778 min_score = ~(u32)0;
779 cand = NULL;
780 candp = NULL;
781 now = jiffies;
782
783 rthp = &rt_hash_table[hash].chain;
784
785 spin_lock_bh(&rt_hash_table[hash].lock);
786 while ((rth = *rthp) != NULL) {
787 if (compare_keys(&rth->fl, &rt->fl)) {
788
789 *rthp = rth->u.rt_next;
790
791
792
793
794
795 smp_wmb();
796 rth->u.rt_next = rt_hash_table[hash].chain;
797
798
799
800
801 smp_wmb();
802 rt_hash_table[hash].chain = rth;
803
804 rth->u.dst.__use++;
805 dst_hold(&rth->u.dst);
806 rth->u.dst.lastuse = now;
807 spin_unlock_bh(&rt_hash_table[hash].lock);
808
809 rt_drop(rt);
810 *rp = rth;
811 return 0;
812 }
813
814 if (!atomic_read(&rth->u.dst.__refcnt)) {
815 u32 score = rt_score(rth);
816
817 if (score <= min_score) {
818 cand = rth;
819 candp = rthp;
820 min_score = score;
821 }
822 }
823
824 chain_length++;
825
826 rthp = &rth->u.rt_next;
827 }
828
829 if (cand) {
830
831
832
833
834
835
836 if (chain_length > ip_rt_gc_elasticity) {
837 *candp = cand->u.rt_next;
838 rt_free(cand);
839 }
840 }
841
842
843
844
845 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
846 int err = arp_bind_neighbour(&rt->u.dst);
847 if (err) {
848 spin_unlock_bh(&rt_hash_table[hash].lock);
849
850 if (err != -ENOBUFS) {
851 rt_drop(rt);
852 return err;
853 }
854
855
856
857
858
859 if (attempts-- > 0) {
860 int saved_elasticity = ip_rt_gc_elasticity;
861 int saved_int = ip_rt_gc_min_interval;
862 ip_rt_gc_elasticity = 1;
863 ip_rt_gc_min_interval = 0;
864 rt_garbage_collect();
865 ip_rt_gc_min_interval = saved_int;
866 ip_rt_gc_elasticity = saved_elasticity;
867 goto restart;
868 }
869
870 if (net_ratelimit())
871 printk(KERN_WARNING "Neighbour table overflow.\n");
872 rt_drop(rt);
873 return -ENOBUFS;
874 }
875 }
876
877 rt->u.rt_next = rt_hash_table[hash].chain;
878#if RT_CACHE_DEBUG >= 2
879 if (rt->u.rt_next) {
880 struct rtable *trt;
881 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
882 NIPQUAD(rt->rt_dst));
883 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
884 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
885 printk("\n");
886 }
887#endif
888 rt_hash_table[hash].chain = rt;
889 spin_unlock_bh(&rt_hash_table[hash].lock);
890 *rp = rt;
891 return 0;
892}
893
894void rt_bind_peer(struct rtable *rt, int create)
895{
896 static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
897 struct inet_peer *peer;
898
899 peer = inet_getpeer(rt->rt_dst, create);
900
901 spin_lock_bh(&rt_peer_lock);
902 if (rt->peer == NULL) {
903 rt->peer = peer;
904 peer = NULL;
905 }
906 spin_unlock_bh(&rt_peer_lock);
907 if (peer)
908 inet_putpeer(peer);
909}
910
911
912
913
914
915
916
917
918static void ip_select_fb_ident(struct iphdr *iph)
919{
920 static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
921 static u32 ip_fallback_id;
922 u32 salt;
923
924 spin_lock_bh(&ip_fb_id_lock);
925 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
926 iph->id = htons(salt & 0xFFFF);
927 ip_fallback_id = salt;
928 spin_unlock_bh(&ip_fb_id_lock);
929}
930
931void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
932{
933 struct rtable *rt = (struct rtable *) dst;
934
935 if (rt) {
936 if (rt->peer == NULL)
937 rt_bind_peer(rt, 1);
938
939
940
941
942 if (rt->peer) {
943 iph->id = htons(inet_getid(rt->peer, more));
944 return;
945 }
946 } else
947 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
948
949 ip_select_fb_ident(iph);
950}
951
952static void rt_del(unsigned hash, struct rtable *rt)
953{
954 struct rtable **rthp;
955
956 spin_lock_bh(&rt_hash_table[hash].lock);
957 ip_rt_put(rt);
958 for (rthp = &rt_hash_table[hash].chain; *rthp;
959 rthp = &(*rthp)->u.rt_next)
960 if (*rthp == rt) {
961 *rthp = rt->u.rt_next;
962 rt_free(rt);
963 break;
964 }
965 spin_unlock_bh(&rt_hash_table[hash].lock);
966}
967
968void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
969 u32 saddr, u8 tos, struct net_device *dev)
970{
971 int i, k;
972 struct in_device *in_dev = in_dev_get(dev);
973 struct rtable *rth, **rthp;
974 u32 skeys[2] = { saddr, 0 };
975 int ikeys[2] = { dev->ifindex, 0 };
976
977 tos &= IPTOS_RT_MASK;
978
979 if (!in_dev)
980 return;
981
982 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
983 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
984 goto reject_redirect;
985
986 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
987 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
988 goto reject_redirect;
989 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
990 goto reject_redirect;
991 } else {
992 if (inet_addr_type(new_gw) != RTN_UNICAST)
993 goto reject_redirect;
994 }
995
996 for (i = 0; i < 2; i++) {
997 for (k = 0; k < 2; k++) {
998 unsigned hash = rt_hash_code(daddr,
999 skeys[i] ^ (ikeys[k] << 5),
1000 tos);
1001
1002 rthp=&rt_hash_table[hash].chain;
1003
1004 rcu_read_lock();
1005 while ((rth = *rthp) != NULL) {
1006 struct rtable *rt;
1007
1008 smp_read_barrier_depends();
1009 if (rth->fl.fl4_dst != daddr ||
1010 rth->fl.fl4_src != skeys[i] ||
1011 rth->fl.fl4_tos != tos ||
1012 rth->fl.oif != ikeys[k] ||
1013 rth->fl.iif != 0) {
1014 rthp = &rth->u.rt_next;
1015 continue;
1016 }
1017
1018 if (rth->rt_dst != daddr ||
1019 rth->rt_src != saddr ||
1020 rth->u.dst.error ||
1021 rth->rt_gateway != old_gw ||
1022 rth->u.dst.dev != dev)
1023 break;
1024
1025 dst_hold(&rth->u.dst);
1026 rcu_read_unlock();
1027
1028 rt = dst_alloc(&ipv4_dst_ops);
1029 if (rt == NULL) {
1030 ip_rt_put(rth);
1031 in_dev_put(in_dev);
1032 return;
1033 }
1034
1035
1036 *rt = *rth;
1037 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1038 rt->u.dst.__use = 1;
1039 atomic_set(&rt->u.dst.__refcnt, 1);
1040 rt->u.dst.child = NULL;
1041 if (rt->u.dst.dev)
1042 dev_hold(rt->u.dst.dev);
1043 rt->u.dst.obsolete = 0;
1044 rt->u.dst.lastuse = jiffies;
1045 rt->u.dst.path = &rt->u.dst;
1046 rt->u.dst.neighbour = NULL;
1047 rt->u.dst.hh = NULL;
1048 rt->u.dst.xfrm = NULL;
1049
1050 rt->rt_flags |= RTCF_REDIRECTED;
1051
1052
1053 rt->rt_gateway = new_gw;
1054
1055
1056 dst_confirm(&rth->u.dst);
1057
1058 if (rt->peer)
1059 atomic_inc(&rt->peer->refcnt);
1060
1061 if (arp_bind_neighbour(&rt->u.dst) ||
1062 !(rt->u.dst.neighbour->nud_state &
1063 NUD_VALID)) {
1064 if (rt->u.dst.neighbour)
1065 neigh_event_send(rt->u.dst.neighbour, NULL);
1066 ip_rt_put(rth);
1067 rt_drop(rt);
1068 goto do_next;
1069 }
1070
1071 rt_del(hash, rth);
1072 if (!rt_intern_hash(hash, rt, &rt))
1073 ip_rt_put(rt);
1074 goto do_next;
1075 }
1076 rcu_read_unlock();
1077 do_next:
1078 ;
1079 }
1080 }
1081 in_dev_put(in_dev);
1082 return;
1083
1084reject_redirect:
1085#ifdef CONFIG_IP_ROUTE_VERBOSE
1086 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1087 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1088 "%u.%u.%u.%u ignored.\n"
1089 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1090 "tos %02x\n",
1091 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1092 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1093#endif
1094 in_dev_put(in_dev);
1095}
1096
1097static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1098{
1099 struct rtable *rt = (struct rtable*)dst;
1100 struct dst_entry *ret = dst;
1101
1102 if (rt) {
1103 if (dst->obsolete) {
1104 ip_rt_put(rt);
1105 ret = NULL;
1106 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1107 rt->u.dst.expires) {
1108 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1109 rt->fl.fl4_src ^
1110 (rt->fl.oif << 5),
1111 rt->fl.fl4_tos);
1112#if RT_CACHE_DEBUG >= 1
1113 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1114 "%u.%u.%u.%u/%02x dropped\n",
1115 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1116#endif
1117 rt_del(hash, rt);
1118 ret = NULL;
1119 }
1120 }
1121 return ret;
1122}
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140void ip_rt_send_redirect(struct sk_buff *skb)
1141{
1142 struct rtable *rt = (struct rtable*)skb->dst;
1143 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1144
1145 if (!in_dev)
1146 return;
1147
1148 if (!IN_DEV_TX_REDIRECTS(in_dev))
1149 goto out;
1150
1151
1152
1153
1154 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1155 rt->u.dst.rate_tokens = 0;
1156
1157
1158
1159
1160 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1161 rt->u.dst.rate_last = jiffies;
1162 goto out;
1163 }
1164
1165
1166
1167
1168 if (time_after(jiffies,
1169 (rt->u.dst.rate_last +
1170 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1171 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1172 rt->u.dst.rate_last = jiffies;
1173 ++rt->u.dst.rate_tokens;
1174#ifdef CONFIG_IP_ROUTE_VERBOSE
1175 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1176 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1177 net_ratelimit())
1178 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1179 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1180 NIPQUAD(rt->rt_src), rt->rt_iif,
1181 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1182#endif
1183 }
1184out:
1185 in_dev_put(in_dev);
1186}
1187
1188static int ip_error(struct sk_buff *skb)
1189{
1190 struct rtable *rt = (struct rtable*)skb->dst;
1191 unsigned long now;
1192 int code;
1193
1194 switch (rt->u.dst.error) {
1195 case EINVAL:
1196 default:
1197 goto out;
1198 case EHOSTUNREACH:
1199 code = ICMP_HOST_UNREACH;
1200 break;
1201 case ENETUNREACH:
1202 code = ICMP_NET_UNREACH;
1203 break;
1204 case EACCES:
1205 code = ICMP_PKT_FILTERED;
1206 break;
1207 }
1208
1209 now = jiffies;
1210 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1211 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1212 rt->u.dst.rate_tokens = ip_rt_error_burst;
1213 rt->u.dst.rate_last = now;
1214 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1215 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1216 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1217 }
1218
1219out: kfree_skb(skb);
1220 return 0;
1221}
1222
1223
1224
1225
1226
1227
/*
 * MTU plateau values in descending order.  The table is only ever read,
 * so it is declared const.
 */
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not larger than any table entry.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
1240
1241unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1242{
1243 int i;
1244 unsigned short old_mtu = ntohs(iph->tot_len);
1245 struct rtable *rth;
1246 u32 skeys[2] = { iph->saddr, 0, };
1247 u32 daddr = iph->daddr;
1248 u8 tos = iph->tos & IPTOS_RT_MASK;
1249 unsigned short est_mtu = 0;
1250
1251 if (ipv4_config.no_pmtu_disc)
1252 return 0;
1253
1254 for (i = 0; i < 2; i++) {
1255 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1256
1257 rcu_read_lock();
1258 for (rth = rt_hash_table[hash].chain; rth;
1259 rth = rth->u.rt_next) {
1260 smp_read_barrier_depends();
1261 if (rth->fl.fl4_dst == daddr &&
1262 rth->fl.fl4_src == skeys[i] &&
1263 rth->rt_dst == daddr &&
1264 rth->rt_src == iph->saddr &&
1265 rth->fl.fl4_tos == tos &&
1266 rth->fl.iif == 0 &&
1267 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1268 unsigned short mtu = new_mtu;
1269
1270 if (new_mtu < 68 || new_mtu >= old_mtu) {
1271
1272
1273 if (mtu == 0 &&
1274 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1275 old_mtu >= 68 + (iph->ihl << 2))
1276 old_mtu -= iph->ihl << 2;
1277
1278 mtu = guess_mtu(old_mtu);
1279 }
1280 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1281 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1282 dst_confirm(&rth->u.dst);
1283 if (mtu < ip_rt_min_pmtu) {
1284 mtu = ip_rt_min_pmtu;
1285 rth->u.dst.metrics[RTAX_LOCK-1] |=
1286 (1 << RTAX_MTU);
1287 }
1288 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1289 dst_set_expires(&rth->u.dst,
1290 ip_rt_mtu_expires);
1291 }
1292 est_mtu = mtu;
1293 }
1294 }
1295 }
1296 rcu_read_unlock();
1297 }
1298 return est_mtu ? : new_mtu;
1299}
1300
1301static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1302{
1303 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1304 !(dst_metric_locked(dst, RTAX_MTU))) {
1305 if (mtu < ip_rt_min_pmtu) {
1306 mtu = ip_rt_min_pmtu;
1307 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1308 }
1309 dst->metrics[RTAX_MTU-1] = mtu;
1310 dst_set_expires(dst, ip_rt_mtu_expires);
1311 }
1312}
1313
1314static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1315{
1316 dst_release(dst);
1317 return NULL;
1318}
1319
1320static void ipv4_dst_destroy(struct dst_entry *dst)
1321{
1322 struct rtable *rt = (struct rtable *) dst;
1323 struct inet_peer *peer = rt->peer;
1324
1325 if (peer) {
1326 rt->peer = NULL;
1327 inet_putpeer(peer);
1328 }
1329}
1330
1331static void ipv4_link_failure(struct sk_buff *skb)
1332{
1333 struct rtable *rt;
1334
1335 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1336
1337 rt = (struct rtable *) skb->dst;
1338 if (rt)
1339 dst_set_expires(&rt->u.dst, 0);
1340}
1341
1342static int ip_rt_bug(struct sk_buff *skb)
1343{
1344 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1345 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1346 skb->dev ? skb->dev->name : "?");
1347 kfree_skb(skb);
1348 return 0;
1349}
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360void ip_rt_get_source(u8 *addr, struct rtable *rt)
1361{
1362 u32 src;
1363 struct fib_result res;
1364
1365 if (rt->fl.iif == 0)
1366 src = rt->rt_src;
1367 else if (fib_lookup(&rt->fl, &res) == 0) {
1368#ifdef CONFIG_IP_ROUTE_NAT
1369 if (res.type == RTN_NAT)
1370 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1371 RT_SCOPE_UNIVERSE);
1372 else
1373#endif
1374 src = FIB_RES_PREFSRC(res);
1375 fib_res_put(&res);
1376 } else
1377 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1378 RT_SCOPE_UNIVERSE);
1379 memcpy(addr, &src, 4);
1380}
1381
1382#ifdef CONFIG_NET_CLS_ROUTE
1383static void set_class_tag(struct rtable *rt, u32 tag)
1384{
1385 if (!(rt->u.dst.tclassid & 0xFFFF))
1386 rt->u.dst.tclassid |= tag & 0xFFFF;
1387 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1388 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1389}
1390#endif
1391
1392static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1393{
1394 struct fib_info *fi = res->fi;
1395
1396 if (fi) {
1397 if (FIB_RES_GW(*res) &&
1398 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1399 rt->rt_gateway = FIB_RES_GW(*res);
1400 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1401 sizeof(rt->u.dst.metrics));
1402 if (fi->fib_mtu == 0) {
1403 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1404 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1405 rt->rt_gateway != rt->rt_dst &&
1406 rt->u.dst.dev->mtu > 576)
1407 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1408 }
1409#ifdef CONFIG_NET_CLS_ROUTE
1410 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1411#endif
1412 } else
1413 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1414
1415 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1416 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1417 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1418 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1419 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1420 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1421 ip_rt_min_advmss);
1422 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1423 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1424
1425#ifdef CONFIG_NET_CLS_ROUTE
1426#ifdef CONFIG_IP_MULTIPLE_TABLES
1427 set_class_tag(rt, fib_rules_tclass(res));
1428#endif
1429 set_class_tag(rt, itag);
1430#endif
1431 rt->rt_type = res->type;
1432}
1433
1434static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1435 u8 tos, struct net_device *dev, int our)
1436{
1437 unsigned hash;
1438 struct rtable *rth;
1439 u32 spec_dst;
1440 struct in_device *in_dev = in_dev_get(dev);
1441 u32 itag = 0;
1442
1443
1444
1445 if (in_dev == NULL)
1446 return -EINVAL;
1447
1448 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1449 skb->protocol != htons(ETH_P_IP))
1450 goto e_inval;
1451
1452 if (ZERONET(saddr)) {
1453 if (!LOCAL_MCAST(daddr))
1454 goto e_inval;
1455 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1456 } else if (fib_validate_source(saddr, 0, tos, 0,
1457 dev, &spec_dst, &itag) < 0)
1458 goto e_inval;
1459
1460 rth = dst_alloc(&ipv4_dst_ops);
1461 if (!rth)
1462 goto e_nobufs;
1463
1464 rth->u.dst.output= ip_rt_bug;
1465
1466 atomic_set(&rth->u.dst.__refcnt, 1);
1467 rth->u.dst.flags= DST_HOST;
1468 if (in_dev->cnf.no_policy)
1469 rth->u.dst.flags |= DST_NOPOLICY;
1470 rth->fl.fl4_dst = daddr;
1471 rth->rt_dst = daddr;
1472 rth->fl.fl4_tos = tos;
1473#ifdef CONFIG_IP_ROUTE_FWMARK
1474 rth->fl.fl4_fwmark= skb->nfmark;
1475#endif
1476 rth->fl.fl4_src = saddr;
1477 rth->rt_src = saddr;
1478#ifdef CONFIG_IP_ROUTE_NAT
1479 rth->rt_dst_map = daddr;
1480 rth->rt_src_map = saddr;
1481#endif
1482#ifdef CONFIG_NET_CLS_ROUTE
1483 rth->u.dst.tclassid = itag;
1484#endif
1485 rth->rt_iif =
1486 rth->fl.iif = dev->ifindex;
1487 rth->u.dst.dev = &loopback_dev;
1488 dev_hold(rth->u.dst.dev);
1489 rth->fl.oif = 0;
1490 rth->rt_gateway = daddr;
1491 rth->rt_spec_dst= spec_dst;
1492 rth->rt_type = RTN_MULTICAST;
1493 rth->rt_flags = RTCF_MULTICAST;
1494 if (our) {
1495 rth->u.dst.input= ip_local_deliver;
1496 rth->rt_flags |= RTCF_LOCAL;
1497 }
1498
1499#ifdef CONFIG_IP_MROUTE
1500 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1501 rth->u.dst.input = ip_mr_input;
1502#endif
1503 RT_CACHE_STAT_INC(in_slow_mc);
1504
1505 in_dev_put(in_dev);
1506 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1507 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1508
1509e_nobufs:
1510 in_dev_put(in_dev);
1511 return -ENOBUFS;
1512
1513e_inval:
1514 in_dev_put(in_dev);
1515 return -EINVAL;
1516}
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
/*
 * Slow path of input routing, used by ip_route_input() when the route
 * cache holds no entry for the packet.
 *
 * Validates the addresses, consults the FIB, allocates a new cache
 * entry (forwarding, local delivery, broadcast, or an error entry for
 * an unreachable destination) and passes it to rt_intern_hash(), which
 * stores the result through skb->dst.
 *
 * @skb:   packet being routed; skb->protocol, skb->nfmark (with
 *         CONFIG_IP_ROUTE_FWMARK) and skb->mac.raw (martian logging)
 *         are read
 * @daddr: destination address
 * @saddr: source address
 * @tos:   type-of-service key
 * @dev:   receiving device
 *
 * Returns the result of rt_intern_hash() when an entry was built,
 * -EINVAL when the packet is rejected (including when @dev has no
 * in_device), -ENOBUFS when dst_alloc() fails.
 */
int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct in_device *out_dev = NULL;
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = skb->nfmark
#endif
				      } },
			    .iif = dev->ifindex };
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable * rth;
	unsigned hash;
	u32 spec_dst;
	int err = -EINVAL;
	/* Set to 1 only while res holds a reference that "done" must drop. */
	int free_res = 0;

	/* No in_device on the receiving interface: return -EINVAL. */

	if (!in_dev)
		goto out;

	/* Same hash key as the cache lookup in ip_route_input(). */
	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);

	/* Sources that are never acceptable are treated as martians. */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	/*
	 * Limited broadcast, or both addresses zero: handled as broadcast
	 * input without a FIB lookup.
	 */
	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Past this point a zeronet source is a martian. */

	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 * FIB lookup. On failure, reject the packet when
	 * IN_DEV_FORWARD(in_dev) is false; otherwise build an
	 * "unreachable" entry (no_route). err keeps the lookup error for
	 * that entry.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_inval;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

#ifdef CONFIG_IP_ROUTE_NAT
	/*
	 * Policy NAT: map the source through the matching rule, if any.
	 * For an RTN_NAT result, map the destination and look the mapped
	 * address up again; only a unicast result is accepted for it.
	 */

	if (1) {
		u32 src_map = saddr;
		if (res.r)
			src_map = fib_rules_policy(saddr, &res, &flags);

		if (res.type == RTN_NAT) {
			fl.fl4_dst = fib_rules_map_destination(daddr, &res);
			fib_res_put(&res);
			free_res = 0;
			if (fib_lookup(&fl, &res))
				goto e_inval;
			free_res = 1;
			if (res.type != RTN_UNICAST)
				goto e_inval;
			flags |= RTCF_DNAT;
		}
		fl.fl4_src = src_map;
	}
#endif

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		/* Validate the source against the loopback interface index. */
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		/* Overrides the value fib_validate_source() stored in spec_dst. */
		spec_dst = daddr;
		goto local_input;
	}

	/* Everything below builds a forwarding entry. */
	if (!IN_DEV_FORWARD(in_dev))
		goto e_inval;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
#endif
	out_dev = in_dev_get(FIB_RES_DEV(res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
					 "Please, report\n");
		goto e_inval;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
				  &spec_dst, &itag);
	if (err < 0)
		goto martian_source;

	if (err)
		flags |= RTCF_DIRECTSRC;

	/*
	 * Packet would leave through the interface it arrived on and the
	 * source validated as direct (err != 0): mark the entry
	 * RTCF_DOREDIRECT unless NAT or masquerading flags are set.
	 */
	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/*
		 * Non-IP caller: do not create a route back out of the
		 * input interface unless the entry is a DNAT one.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT))
			goto e_inval;
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
	rth->rt_gateway = daddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_src_map = fl.fl4_src;
	rth->rt_dst_map = fl.fl4_dst;
	if (flags&RTCF_DNAT)
		rth->rt_gateway = fl.fl4_dst;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	/* The cache entry takes its own reference on the output device. */
	rth->u.dst.dev = out_dev->dev;
	dev_hold(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, &res, itag);

	rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
	if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
		struct net_device *odev = rth->u.dst.dev;
		if (odev != dev &&
		    dev->accept_fastpath &&
		    odev->mtu >= dev->mtu &&
		    dev->accept_fastpath(dev, &rth->u.dst) == 0)
			rth->rt_flags |= RTCF_FAST;
	}
#endif

intern:
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
done:
	/* Common exit: drop every reference taken above. */
	in_dev_put(in_dev);
	if (out_dev)
		in_dev_put(out_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	/* Broadcast input is accepted for IP frames only. */
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

	/* Falls through: broadcast is delivered like local input. */
local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	/* Input-only entry: sending through it is treated as a bug. */
	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map = fl.fl4_dst;
	rth->rt_src_map = fl.fl4_src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->u.dst.dev = &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags = flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		/*
		 * Reached from no_route only: err still holds the
		 * fib_lookup() error, stored here with its sign flipped.
		 */
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	rth->rt_type = res.type;
	goto intern;

no_route:
	/* free_res is still 0 here, so "done" will not release res. */
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 * Rejected destination: counted, optionally logged, then -EINVAL.
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif
e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:

	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * Rate-limited log of the offending packet, followed by a
		 * hex dump of the link-layer header when the device has one.
		 * NOTE(review): the arguments put daddr in the "martian
		 * source" position and saddr after "from" - confirm this
		 * ordering is intended.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
	goto e_inval;
}
1841
1842int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1843 u8 tos, struct net_device *dev)
1844{
1845 struct rtable * rth;
1846 unsigned hash;
1847 int iif = dev->ifindex;
1848
1849 tos &= IPTOS_RT_MASK;
1850 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1851
1852 rcu_read_lock();
1853 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1854 smp_read_barrier_depends();
1855 if (rth->fl.fl4_dst == daddr &&
1856 rth->fl.fl4_src == saddr &&
1857 rth->fl.iif == iif &&
1858 rth->fl.oif == 0 &&
1859#ifdef CONFIG_IP_ROUTE_FWMARK
1860 rth->fl.fl4_fwmark == skb->nfmark &&
1861#endif
1862 rth->fl.fl4_tos == tos) {
1863 rth->u.dst.lastuse = jiffies;
1864 dst_hold(&rth->u.dst);
1865 rth->u.dst.__use++;
1866 RT_CACHE_STAT_INC(in_hit);
1867 rcu_read_unlock();
1868 skb->dst = (struct dst_entry*)rth;
1869 return 0;
1870 }
1871 RT_CACHE_STAT_INC(in_hlist_search);
1872 }
1873 rcu_read_unlock();
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886 if (MULTICAST(daddr)) {
1887 struct in_device *in_dev;
1888
1889 read_lock(&inetdev_lock);
1890 if ((in_dev = __in_dev_get(dev)) != NULL) {
1891 int our = ip_check_mc(in_dev, daddr, saddr,
1892 skb->nh.iph->protocol);
1893 if (our
1894#ifdef CONFIG_IP_MROUTE
1895 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1896#endif
1897 ) {
1898 read_unlock(&inetdev_lock);
1899 return ip_route_input_mc(skb, daddr, saddr,
1900 tos, dev, our);
1901 }
1902 }
1903 read_unlock(&inetdev_lock);
1904 return -EINVAL;
1905 }
1906 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1907}
1908
1909
1910
1911
1912
/*
 * Slow path of output routing, used by __ip_route_output_key() when the
 * route cache has no matching entry.
 *
 * Works on a private copy (fl) of the caller's flow key, resolves the
 * output device, source address and route type, builds a new cache
 * entry and inserts it with rt_intern_hash(), which returns the entry
 * through @rp. The cache entry itself is keyed on the ORIGINAL values
 * in @oldflp, while rt_dst/rt_src carry the resolved addresses.
 *
 * @rp:     where the resulting route is stored
 * @oldflp: flow key supplied by the caller (never modified)
 *
 * Returns the result of rt_intern_hash() when an entry was built, or
 * -EINVAL, -ENODEV, -ENETUNREACH or -ENOBUFS on the failure paths below.
 */
int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = oldflp->fl4_fwmark
#endif
				      } },
			    .iif = loopback_dev.ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct rtable *rth;
	/* When non-NULL, dev_out carries a reference that "done" drops. */
	struct net_device *dev_out = NULL;
	struct in_device *in_dev = NULL;
	unsigned hash;
	/* Set to 1 only while res holds a reference that "done" must drop. */
	int free_res = 0;
	int err;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* The requested source must resolve to a device via ip_dev_find(). */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/*
		 * Source given, no output interface, and a multicast or
		 * limited-broadcast destination: send through the device
		 * found for the source address, skipping the FIB lookup.
		 */
		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		/* Otherwise the device was needed only for validation. */
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}
	if (oldflp->oif) {
		dev_out = dev_get_by_index(oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;
		/* A device without an in_device is reported as -ENODEV too. */
		if (__in_dev_get(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		/*
		 * No destination: route to the source address, or to
		 * INADDR_LOOPBACK when there is no source either, always
		 * through the loopback device.
		 */
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/*
			 * Lookup failed but the caller named an output
			 * interface: still build a unicast route through
			 * that device, choosing a link-scope source address
			 * if none was given.
			 */
			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_NAT)
		goto e_inval;

	if (res.type == RTN_LOCAL) {
		/* Local destination: deliver through the loopback device. */
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	/* Switch to the device chosen by the FIB result. */
	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	/* A loopback source address is valid only on a loopback device. */
	if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		goto e_inval;

	/* The destination address overrides the route type from the lookup. */
	if (fl.fl4_dst == 0xFFFFFFFF)
		res.type = RTN_BROADCAST;
	else if (MULTICAST(fl.fl4_dst))
		res.type = RTN_MULTICAST;
	else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
		goto e_inval;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		goto e_inval;

	if (res.type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res.fi) {
			fib_info_put(res.fi);
			res.fi = NULL;
		}
	} else if (res.type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/*
		 * Drop the FIB info when the matched prefix is shorter than
		 * 4 bits, so rt_set_nexthop() does not take gateway or
		 * metrics from it.
		 */
		if (res.fi && res.prefixlen < 4) {
			fib_info_put(res.fi);
			res.fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	/* Cache key: the caller's original request. */
	rth->fl.fl4_dst = oldflp->fl4_dst;
	rth->fl.fl4_tos = tos;
	rth->fl.fl4_src = oldflp->fl4_src;
	rth->fl.oif = oldflp->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
#endif
	/* Resolved addresses. */
	rth->rt_dst = fl.fl4_dst;
	rth->rt_src = fl.fl4_src;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map = fl.fl4_dst;
	rth->rt_src_map = fl.fl4_src;
#endif
	/* GNU "?:" shorthand: oldflp->oif if non-zero, else the device index. */
	rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
	/*
	 * The cache entry takes its own reference on the device; the one
	 * held through dev_out is released at "done".
	 */
	rth->u.dst.dev = dev_out;
	dev_hold(dev_out);
	rth->rt_gateway = fl.fl4_dst;
	rth->rt_spec_dst= fl.fl4_src;

	rth->u.dst.output=ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl.fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl.fl4_src;
		if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res.type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, &res, 0);


	rth->rt_flags = flags;

	/* Same hash key as the cache lookup in __ip_route_output_key(). */
	hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
	err = rt_intern_hash(hash, rth, rp);
done:
	/* Common exit: drop every reference still held by this function. */
	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
	if (in_dev)
		in_dev_put(in_dev);
out:	return err;

e_inval:
	err = -EINVAL;
	goto done;
e_nobufs:
	err = -ENOBUFS;
	goto done;
}
2211
2212int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2213{
2214 unsigned hash;
2215 struct rtable *rth;
2216
2217 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2218
2219 rcu_read_lock();
2220 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2221 smp_read_barrier_depends();
2222 if (rth->fl.fl4_dst == flp->fl4_dst &&
2223 rth->fl.fl4_src == flp->fl4_src &&
2224 rth->fl.iif == 0 &&
2225 rth->fl.oif == flp->oif &&
2226#ifdef CONFIG_IP_ROUTE_FWMARK
2227 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2228#endif
2229 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2230 (IPTOS_RT_MASK | RTO_ONLINK))) {
2231 rth->u.dst.lastuse = jiffies;
2232 dst_hold(&rth->u.dst);
2233 rth->u.dst.__use++;
2234 RT_CACHE_STAT_INC(out_hit);
2235 rcu_read_unlock();
2236 *rp = rth;
2237 return 0;
2238 }
2239 RT_CACHE_STAT_INC(out_hlist_search);
2240 }
2241 rcu_read_unlock();
2242
2243 return ip_route_output_slow(rp, flp);
2244}
2245
2246int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2247{
2248 int err;
2249
2250 if ((err = __ip_route_output_key(rp, flp)) != 0)
2251 return err;
2252 return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, NULL, 0) : 0;
2253}
2254
2255int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2256{
2257 int err;
2258
2259 if ((err = __ip_route_output_key(rp, flp)) != 0)
2260 return err;
2261 return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, sk, flags) : 0;
2262}
2263
/*
 * Append a netlink route message describing the cache entry attached
 * to skb->dst to @skb.
 *
 * @skb:    message buffer; skb->dst must point to the rtable to report
 * @pid:    netlink pid placed in the message header
 * @seq:    netlink sequence number placed in the message header
 * @event:  netlink message type
 * @nowait: non-zero when called from a dump; selects NLM_F_MULTI (if
 *          @pid is set) and changes how an ipmr_get_route() failure is
 *          handled
 *
 * Returns skb->len on success, 0 when ipmr_get_route() returned 0 in
 * the !nowait case, and -1 after trimming the skb back to its original
 * length on failure.
 *
 * NOTE(review): NLMSG_PUT and RTA_PUT are macros defined elsewhere;
 * presumably they jump to nlmsg_failure / rtattr_failure when the skb
 * has no room left - confirm against their definitions.
 */
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	/* Start of this message; used for nlmsg_len and for the rollback. */
	unsigned char *b = skb->tail;
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
#endif
	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
	r = NLMSG_DATA(nlh);
	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
	/* Fixed header: cache entries are reported as /32 routes. */
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Only the upper 16 bits of rt_flags are exported. */
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
	/*
	 * Preferred source: rt_spec_dst for entries with an input
	 * interface, otherwise rt_src when it differs from the key.
	 */
	if (rt->fl.iif)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto rtattr_failure;
	/* Cache statistics; jiffies deltas go through jiffies_to_clock_t(). */
	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	ci.rta_used = rt->u.dst.__use;
	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
	if (rt->u.dst.expires)
		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
	else
		ci.rta_expires = 0;
	ci.rta_error = rt->u.dst.error;
	ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
	if (rt->peer) {
		ci.rta_id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ci.rta_ts = rt->peer->tcp_ts;
			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
		}
	}
#ifdef CONFIG_IP_MROUTE
	/* Remember where RTA_CACHEINFO lands so rta_error can be patched below. */
	eptr = (struct rtattr*)skb->tail;
#endif
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					/* 0 is passed up as-is; errors roll the message back. */
					if (err == 0)
						return 0;
					goto nlmsg_failure;
				} else {
					/*
					 * Dump: only -EMSGSIZE aborts; any other
					 * value is recorded in the cacheinfo
					 * attribute already written.
					 */
					if (err == -EMSGSIZE)
						goto nlmsg_failure;
					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
				}
			}
		} else
#endif
			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
	}

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	/* Discard everything appended since entry. */
	skb_trim(skb, b - skb->data);
	return -1;
}
2359
2360int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2361{
2362 struct rtattr **rta = arg;
2363 struct rtmsg *rtm = NLMSG_DATA(nlh);
2364 struct rtable *rt = NULL;
2365 u32 dst = 0;
2366 u32 src = 0;
2367 int iif = 0;
2368 int err = -ENOBUFS;
2369 struct sk_buff *skb;
2370
2371 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2372 if (!skb)
2373 goto out;
2374
2375
2376
2377
2378 skb->mac.raw = skb->data;
2379 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2380
2381 if (rta[RTA_SRC - 1])
2382 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2383 if (rta[RTA_DST - 1])
2384 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2385 if (rta[RTA_IIF - 1])
2386 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2387
2388 if (iif) {
2389 struct net_device *dev = __dev_get_by_index(iif);
2390 err = -ENODEV;
2391 if (!dev)
2392 goto out_free;
2393 skb->protocol = htons(ETH_P_IP);
2394 skb->dev = dev;
2395 local_bh_disable();
2396 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2397 local_bh_enable();
2398 rt = (struct rtable*)skb->dst;
2399 if (!err && rt->u.dst.error)
2400 err = -rt->u.dst.error;
2401 } else {
2402 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2403 .saddr = src,
2404 .tos = rtm->rtm_tos } } };
2405 int oif = 0;
2406 if (rta[RTA_OIF - 1])
2407 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2408 fl.oif = oif;
2409 err = ip_route_output_key(&rt, &fl);
2410 }
2411 if (err)
2412 goto out_free;
2413
2414 skb->dst = &rt->u.dst;
2415 if (rtm->rtm_flags & RTM_F_NOTIFY)
2416 rt->rt_flags |= RTCF_NOTIFY;
2417
2418 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2419
2420 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2421 RTM_NEWROUTE, 0);
2422 if (!err)
2423 goto out_free;
2424 if (err < 0) {
2425 err = -EMSGSIZE;
2426 goto out_free;
2427 }
2428
2429 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2430 if (err > 0)
2431 err = 0;
2432out: return err;
2433
2434out_free:
2435 kfree_skb(skb);
2436 goto out;
2437}
2438
2439int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2440{
2441 struct rtable *rt;
2442 int h, s_h;
2443 int idx, s_idx;
2444
2445 s_h = cb->args[0];
2446 s_idx = idx = cb->args[1];
2447 for (h = 0; h <= rt_hash_mask; h++) {
2448 if (h < s_h) continue;
2449 if (h > s_h)
2450 s_idx = 0;
2451 rcu_read_lock();
2452 for (rt = rt_hash_table[h].chain, idx = 0; rt;
2453 rt = rt->u.rt_next, idx++) {
2454 smp_read_barrier_depends();
2455 if (idx < s_idx)
2456 continue;
2457 skb->dst = dst_clone(&rt->u.dst);
2458 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2459 cb->nlh->nlmsg_seq,
2460 RTM_NEWROUTE, 1) <= 0) {
2461 dst_release(xchg(&skb->dst, NULL));
2462 rcu_read_unlock();
2463 goto done;
2464 }
2465 dst_release(xchg(&skb->dst, NULL));
2466 }
2467 rcu_read_unlock();
2468 }
2469
2470done:
2471 cb->args[0] = h;
2472 cb->args[1] = idx;
2473 return skb->len;
2474}
2475
/*
 * Calls rt_cache_flush() with an argument of 0. The in_dev parameter is
 * not used.
 *
 * NOTE(review): presumably invoked when the multicast configuration of
 * an interface changes - confirm against the callers.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
2480
2481#ifdef CONFIG_SYSCTL
2482static int flush_delay;
2483
2484static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2485 struct file *filp, void *buffer,
2486 size_t *lenp)
2487{
2488 if (write) {
2489 proc_dointvec(ctl, write, filp, buffer, lenp);
2490 rt_cache_flush(flush_delay);
2491 return 0;
2492 }
2493
2494 return -EINVAL;
2495}
2496
2497static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
2498 int nlen, void *oldval,
2499 size_t *oldlenp, void *newval,
2500 size_t newlen, void **context)
2501{
2502 int delay;
2503 if (newlen != sizeof(int))
2504 return -EINVAL;
2505 if (get_user(delay, (int *)newval))
2506 return -EFAULT;
2507 rt_cache_flush(delay);
2508 return 0;
2509}
2510
/*
 * sysctl table for the IPv4 routing cache tunables.
 *
 * Each entry binds a procname to one of the ip_rt_* variables (or, for
 * "flush", to the handlers above). Entries using proc_dointvec_jiffies
 * and sysctl_jiffies are handled as jiffies-based intervals; entries
 * using plain proc_dointvec expose the raw integer. The table ends with
 * an entry whose ctl_name is 0.
 */
ctl_table ipv4_route_table[] = {
	/* Writing a delay here flushes the route cache; reads return -EINVAL. */
	{
		.ctl_name = NET_IPV4_ROUTE_FLUSH,
		.procname = "flush",
		.data = &flush_delay,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &ipv4_sysctl_rtcache_flush,
		.strategy = &ipv4_sysctl_rtcache_flush_strategy,
	},
	/* ip_rt_min_delay (jiffies interval). */
	{
		.ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
		.procname = "min_delay",
		.data = &ip_rt_min_delay,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_jiffies,
		.strategy = &sysctl_jiffies,
	},
	/* ip_rt_max_delay (jiffies interval). */
	{
		.ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
		.procname = "max_delay",
		.data = &ip_rt_max_delay,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_jiffies,
		.strategy = &sysctl_jiffies,
	},
	/* ipv4_dst_ops.gc_thresh (raw integer). */
	{
		.ctl_name = NET_IPV4_ROUTE_GC_THRESH,
		.procname = "gc_thresh",
		.data = &ipv4_dst_ops.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
	/* ip_rt_max_size (raw integer). */
	{
		.ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
		.procname = "max_size",
		.data = &ip_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
	/* ip_rt_gc_min_interval (jiffies interval). */
	{
		.ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname = "gc_min_interval",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_jiffies,
		.strategy = &sysctl_jiffies,
	},
	/* ip_rt_gc_timeout (jiffies interval). */
	{
		.ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname = "gc_timeout",
		.data = &ip_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_jiffies,
		.strategy = &sysctl_jiffies,
	},
	/* ip_rt_gc_interval (jiffies interval). */
	{
		.ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
		.procname = "gc_interval",
		.data = &ip_rt_gc_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_jiffies,
		.strategy = &sysctl_jiffies,
	},
	/* ip_rt_redirect_load (raw integer; its default is expressed in HZ). */
	{
		.ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname = "redirect_load",
		.data = &ip_rt_redirect_load,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
	/* ip_rt_redirect_number (raw integer). */
	{
		.ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname = "redirect_number",
		.data = &ip_rt_redirect_number,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
	/* ip_rt_redirect_silence (raw integer; its default is expressed in HZ). */
	{
		.ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname = "redirect_silence",
		.data = &ip_rt_redirect_silence,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
	/* ip_rt_error_cost (raw integer; its default is expressed in HZ). */
	{
		.ctl_name = NET_IPV4_ROUTE_ERROR_COST,
		.procname = "error_cost",
		.data = &ip_rt_error_cost,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
	/* ip_rt_error_burst (raw integer; its default is expressed in HZ). */
	{
		.ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
		.procname = "error_burst",
		.data = &ip_rt_error_burst,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
	/* ip_rt_gc_elasticity (raw integer). */
	{
		.ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname = "gc_elasticity",
		.data = &ip_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
	/* ip_rt_mtu_expires (jiffies interval). */
	{
		.ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname = "mtu_expires",
		.data = &ip_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_jiffies,
		.strategy = &sysctl_jiffies,
	},
	/* ip_rt_min_pmtu (raw integer). */
	{
		.ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
		.procname = "min_pmtu",
		.data = &ip_rt_min_pmtu,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
	/* ip_rt_min_advmss (raw integer); lower bound used by rt_set_nexthop(). */
	{
		.ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname = "min_adv_mss",
		.data = &ip_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
	/* ip_rt_secret_interval (jiffies interval). */
	{
		.ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname = "secret_interval",
		.data = &ip_rt_secret_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_jiffies,
		.strategy = &sysctl_jiffies,
	},
	/* Terminator. */
	{ .ctl_name = 0 }
};
2666#endif
2667
2668#ifdef CONFIG_NET_CLS_ROUTE
/* Base of the per-CPU route accounting tables; allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct;

/*
 * Address of the accounting table that belongs to CPU i. Every CPU owns
 * 256 consecutive struct ip_rt_acct entries.
 *
 * The argument is parenthesised so that an expression argument (for
 * example "cpu + 1") is scaled as a whole rather than only its last
 * operand.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
2675
2676#ifdef CONFIG_PROC_FS
2677static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2678 int length, int *eof, void *data)
2679{
2680 unsigned int i;
2681
2682 if ((offset & 3) || (length & 3))
2683 return -EIO;
2684
2685 if (offset >= sizeof(struct ip_rt_acct) * 256) {
2686 *eof = 1;
2687 return 0;
2688 }
2689
2690 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2691 length = sizeof(struct ip_rt_acct) * 256 - offset;
2692 *eof = 1;
2693 }
2694
2695 offset /= sizeof(u32);
2696
2697 if (length > 0) {
2698 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2699 u32 *dst = (u32 *) buffer;
2700
2701
2702 *start = buffer;
2703 memcpy(dst, src, length);
2704
2705
2706 for (i = 1; i < NR_CPUS; i++) {
2707 unsigned int j;
2708
2709 if (!cpu_online(i))
2710 continue;
2711
2712 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2713
2714 for (j = 0; j < length/4; j++)
2715 dst[j] += src[j];
2716 }
2717 }
2718 return length;
2719}
2720#endif
2721#endif
2722
2723int __init ip_rt_init(void)
2724{
2725 int i, order, goal, rc = 0;
2726
2727 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2728 (jiffies ^ (jiffies >> 7)));
2729
2730#ifdef CONFIG_NET_CLS_ROUTE
2731 for (order = 0;
2732 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2733 ;
2734 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2735 if (!ip_rt_acct)
2736 panic("IP: failed to allocate ip_rt_acct\n");
2737 memset(ip_rt_acct, 0, PAGE_SIZE << order);
2738#endif
2739
2740 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2741 sizeof(struct rtable),
2742 0, SLAB_HWCACHE_ALIGN,
2743 NULL, NULL);
2744
2745 if (!ipv4_dst_ops.kmem_cachep)
2746 panic("IP: failed to allocate ip_dst_cache\n");
2747
2748 goal = num_physpages >> (26 - PAGE_SHIFT);
2749
2750 for (order = 0; (1UL << order) < goal; order++)
2751 ;
2752
2753 do {
2754 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2755 sizeof(struct rt_hash_bucket);
2756 while (rt_hash_mask & (rt_hash_mask - 1))
2757 rt_hash_mask--;
2758 rt_hash_table = (struct rt_hash_bucket *)
2759 __get_free_pages(GFP_ATOMIC, order);
2760 } while (rt_hash_table == NULL && --order > 0);
2761
2762 if (!rt_hash_table)
2763 panic("Failed to allocate IP route cache hash table\n");
2764
2765 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2766 rt_hash_mask,
2767 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2768
2769 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2770 ;
2771
2772 rt_hash_mask--;
2773 for (i = 0; i <= rt_hash_mask; i++) {
2774 rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
2775 rt_hash_table[i].chain = NULL;
2776 }
2777
2778 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2779 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2780
2781 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2782 if (!rt_cache_stat)
2783 return -ENOMEM;
2784
2785 devinet_init();
2786 ip_fib_init();
2787
2788 init_timer(&rt_flush_timer);
2789 rt_flush_timer.function = rt_run_flush;
2790 init_timer(&rt_periodic_timer);
2791 rt_periodic_timer.function = rt_check_expire;
2792 init_timer(&rt_secret_timer);
2793 rt_secret_timer.function = rt_secret_rebuild;
2794
2795
2796
2797
2798 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2799 ip_rt_gc_interval;
2800 add_timer(&rt_periodic_timer);
2801
2802 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2803 ip_rt_secret_interval;
2804 add_timer(&rt_secret_timer);
2805
2806#ifdef CONFIG_PROC_FS
2807 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2808 !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
2809 free_percpu(rt_cache_stat);
2810 return -ENOMEM;
2811 }
2812
2813#ifdef CONFIG_NET_CLS_ROUTE
2814 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
2815#endif
2816#endif
2817#ifdef CONFIG_XFRM
2818 xfrm_init();
2819 xfrm4_init();
2820#endif
2821 return rc;
2822}
2823
/* Symbols from this file that are exported with EXPORT_SYMBOL. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);
2827