1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63#include <linux/config.h>
64#include <asm/uaccess.h>
65#include <asm/system.h>
66#include <asm/bitops.h>
67#include <linux/types.h>
68#include <linux/kernel.h>
69#include <linux/sched.h>
70#include <linux/mm.h>
71#include <linux/string.h>
72#include <linux/socket.h>
73#include <linux/sockios.h>
74#include <linux/errno.h>
75#include <linux/in.h>
76#include <linux/inet.h>
77#include <linux/netdevice.h>
78#include <linux/proc_fs.h>
79#include <linux/init.h>
80#include <linux/skbuff.h>
81#include <linux/rtnetlink.h>
82#include <linux/inetdevice.h>
83#include <linux/igmp.h>
84#include <linux/pkt_sched.h>
85#include <linux/mroute.h>
86#include <linux/netfilter_ipv4.h>
87#include <linux/random.h>
88#include <linux/jhash.h>
89#include <net/protocol.h>
90#include <net/ip.h>
91#include <net/route.h>
92#include <net/inetpeer.h>
93#include <net/sock.h>
94#include <net/ip_fib.h>
95#include <net/arp.h>
96#include <net/tcp.h>
97#include <net/icmp.h>
98#ifdef CONFIG_SYSCTL
99#include <linux/sysctl.h>
100#endif
101
102#define IP_MAX_MTU 0xFFF0
103
104#define RT_GC_TIMEOUT (300*HZ)
105
106int ip_rt_min_delay = 2 * HZ;
107int ip_rt_max_delay = 10 * HZ;
108int ip_rt_max_size;
109int ip_rt_gc_timeout = RT_GC_TIMEOUT;
110int ip_rt_gc_interval = 60 * HZ;
111int ip_rt_gc_min_interval = HZ / 2;
112int ip_rt_redirect_number = 9;
113int ip_rt_redirect_load = HZ / 50;
114int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
115int ip_rt_error_cost = HZ;
116int ip_rt_error_burst = 5 * HZ;
117int ip_rt_gc_elasticity = 8;
118int ip_rt_mtu_expires = 10 * 60 * HZ;
119int ip_rt_min_pmtu = 512 + 20 + 20;
120int ip_rt_min_advmss = 256;
121int ip_rt_secret_interval = 10 * 60 * HZ;
122static unsigned long rt_deadline;
123
124#define RTprint(a...) printk(KERN_DEBUG a)
125
126static struct timer_list rt_flush_timer;
127static struct timer_list rt_periodic_timer;
128static struct timer_list rt_secret_timer;
129
130
131
132
133
134static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
135static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
136 struct sk_buff *skb);
137static void ipv4_dst_destroy(struct dst_entry *dst);
138static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
139static void ipv4_link_failure(struct sk_buff *skb);
140static int rt_garbage_collect(void);
141
142
143struct dst_ops ipv4_dst_ops = {
144 family: AF_INET,
145 protocol: __constant_htons(ETH_P_IP),
146 gc: rt_garbage_collect,
147 check: ipv4_dst_check,
148 reroute: ipv4_dst_reroute,
149 destroy: ipv4_dst_destroy,
150 negative_advice: ipv4_negative_advice,
151 link_failure: ipv4_link_failure,
152 entry_size: sizeof(struct rtable),
153};
154
155#define ECN_OR_COST(class) TC_PRIO_##class
156
157__u8 ip_tos2prio[16] = {
158 TC_PRIO_BESTEFFORT,
159 ECN_OR_COST(FILLER),
160 TC_PRIO_BESTEFFORT,
161 ECN_OR_COST(BESTEFFORT),
162 TC_PRIO_BULK,
163 ECN_OR_COST(BULK),
164 TC_PRIO_BULK,
165 ECN_OR_COST(BULK),
166 TC_PRIO_INTERACTIVE,
167 ECN_OR_COST(INTERACTIVE),
168 TC_PRIO_INTERACTIVE,
169 ECN_OR_COST(INTERACTIVE),
170 TC_PRIO_INTERACTIVE_BULK,
171 ECN_OR_COST(INTERACTIVE_BULK),
172 TC_PRIO_INTERACTIVE_BULK,
173 ECN_OR_COST(INTERACTIVE_BULK)
174};
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191struct rt_hash_bucket {
192 struct rtable *chain;
193 rwlock_t lock;
194} __attribute__((__aligned__(8)));
195
196static struct rt_hash_bucket *rt_hash_table;
197static unsigned rt_hash_mask;
198static int rt_hash_log;
199static unsigned int rt_hash_rnd;
200
201struct rt_cache_stat rt_cache_stat[NR_CPUS];
202
203static int rt_intern_hash(unsigned hash, struct rtable *rth,
204 struct rtable **res);
205
206static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
207{
208 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
209 & rt_hash_mask);
210}
211
212static int rt_cache_get_info(char *buffer, char **start, off_t offset,
213 int length)
214{
215 int len = 0;
216 off_t pos = 128;
217 char temp[256];
218 struct rtable *r;
219 int i;
220
221 if (offset < 128) {
222 sprintf(buffer, "%-127s\n",
223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 "HHUptod\tSpecDst");
226 len = 128;
227 }
228
229 for (i = rt_hash_mask; i >= 0; i--) {
230 read_lock_bh(&rt_hash_table[i].lock);
231 for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
232
233
234
235 pos += 128;
236
237 if (pos <= offset) {
238 len = 0;
239 continue;
240 }
241 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
242 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
243 r->u.dst.dev ? r->u.dst.dev->name : "*",
244 (unsigned long)r->rt_dst,
245 (unsigned long)r->rt_gateway,
246 r->rt_flags,
247 atomic_read(&r->u.dst.__refcnt),
248 r->u.dst.__use,
249 0,
250 (unsigned long)r->rt_src,
251 (r->u.dst.advmss ?
252 (int) r->u.dst.advmss + 40 : 0),
253 r->u.dst.window,
254 (int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
255 r->key.tos,
256 r->u.dst.hh ?
257 atomic_read(&r->u.dst.hh->hh_refcnt) :
258 -1,
259 r->u.dst.hh ?
260 (r->u.dst.hh->hh_output ==
261 dev_queue_xmit) : 0,
262 r->rt_spec_dst);
263 sprintf(buffer + len, "%-127s\n", temp);
264 len += 128;
265 if (pos >= offset+length) {
266 read_unlock_bh(&rt_hash_table[i].lock);
267 goto done;
268 }
269 }
270 read_unlock_bh(&rt_hash_table[i].lock);
271 }
272
273done:
274 *start = buffer + len - (pos - offset);
275 len = pos - offset;
276 if (len > length)
277 len = length;
278 return len;
279}
280
281static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
282{
283 unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
284 int i, lcpu;
285 int len = 0;
286
287 len += sprintf(buffer+len, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
288 for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) {
289 i = cpu_logical_map(lcpu);
290
291 len += sprintf(buffer+len, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
292 dst_entries,
293 rt_cache_stat[i].in_hit,
294 rt_cache_stat[i].in_slow_tot,
295 rt_cache_stat[i].in_slow_mc,
296 rt_cache_stat[i].in_no_route,
297 rt_cache_stat[i].in_brd,
298 rt_cache_stat[i].in_martian_dst,
299 rt_cache_stat[i].in_martian_src,
300
301 rt_cache_stat[i].out_hit,
302 rt_cache_stat[i].out_slow_tot,
303 rt_cache_stat[i].out_slow_mc,
304
305 rt_cache_stat[i].gc_total,
306 rt_cache_stat[i].gc_ignored,
307 rt_cache_stat[i].gc_goal_miss,
308 rt_cache_stat[i].gc_dst_overflow,
309 rt_cache_stat[i].in_hlist_search,
310 rt_cache_stat[i].out_hlist_search
311
312 );
313 }
314 len -= offset;
315
316 if (len > length)
317 len = length;
318 if (len < 0)
319 len = 0;
320
321 *start = buffer + offset;
322 return len;
323}
324
325static __inline__ void rt_free(struct rtable *rt)
326{
327 dst_free(&rt->u.dst);
328}
329
330static __inline__ void rt_drop(struct rtable *rt)
331{
332 ip_rt_put(rt);
333 dst_free(&rt->u.dst);
334}
335
336static __inline__ int rt_fast_clean(struct rtable *rth)
337{
338
339
340 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
341 rth->key.iif && rth->u.rt_next;
342}
343
344static __inline__ int rt_valuable(struct rtable *rth)
345{
346 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
347 rth->u.dst.expires;
348}
349
350static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
351{
352 unsigned long age;
353 int ret = 0;
354
355 if (atomic_read(&rth->u.dst.__refcnt))
356 goto out;
357
358 ret = 1;
359 if (rth->u.dst.expires &&
360 time_after_eq(jiffies, rth->u.dst.expires))
361 goto out;
362
363 age = jiffies - rth->u.dst.lastuse;
364 ret = 0;
365 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
366 (age <= tmo2 && rt_valuable(rth)))
367 goto out;
368 ret = 1;
369out: return ret;
370}
371
372
373
374
375
376
377static inline u32 rt_score(struct rtable *rt)
378{
379 u32 score = jiffies - rt->u.dst.lastuse;
380
381 score = ~score & ~(3<<30);
382
383 if (rt_valuable(rt))
384 score |= (1<<31);
385
386 if (!rt->key.iif ||
387 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
388 score |= (1<<30);
389
390 return score;
391}
392
393
394static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
395{
396 static int rover;
397 int i = rover, t;
398 struct rtable *rth, **rthp;
399 unsigned long now = jiffies;
400
401 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
402 t -= ip_rt_gc_timeout) {
403 unsigned long tmo = ip_rt_gc_timeout;
404
405 i = (i + 1) & rt_hash_mask;
406 rthp = &rt_hash_table[i].chain;
407
408 write_lock(&rt_hash_table[i].lock);
409 while ((rth = *rthp) != NULL) {
410 if (rth->u.dst.expires) {
411
412 if (time_before_eq(now, rth->u.dst.expires)) {
413 tmo >>= 1;
414 rthp = &rth->u.rt_next;
415 continue;
416 }
417 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
418 tmo >>= 1;
419 rthp = &rth->u.rt_next;
420 continue;
421 }
422
423
424 *rthp = rth->u.rt_next;
425 rt_free(rth);
426 }
427 write_unlock(&rt_hash_table[i].lock);
428
429
430 if (time_after(jiffies, now))
431 break;
432 }
433 rover = i;
434 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
435}
436
437SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);
438
439
440
441
442static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
443{
444 int i;
445 struct rtable *rth, *next;
446
447 rt_deadline = 0;
448
449 get_random_bytes(&rt_hash_rnd, 4);
450
451 for (i = rt_hash_mask; i >= 0; i--) {
452 write_lock_bh(&rt_hash_table[i].lock);
453 rth = rt_hash_table[i].chain;
454 if (rth)
455 rt_hash_table[i].chain = NULL;
456 write_unlock_bh(&rt_hash_table[i].lock);
457
458 for (; rth; rth = next) {
459 next = rth->u.rt_next;
460 rt_free(rth);
461 }
462 }
463}
464
465SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);
466
467static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
468
469void rt_cache_flush(int delay)
470{
471 unsigned long now = jiffies;
472 int user_mode = !in_softirq();
473
474 if (delay < 0)
475 delay = ip_rt_min_delay;
476
477 spin_lock_bh(&rt_flush_lock);
478
479 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
480 long tmo = (long)(rt_deadline - now);
481
482
483
484
485
486
487
488
489 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
490 tmo = 0;
491
492 if (delay > tmo)
493 delay = tmo;
494 }
495
496 if (delay <= 0) {
497 spin_unlock_bh(&rt_flush_lock);
498 SMP_TIMER_NAME(rt_run_flush)(0);
499 return;
500 }
501
502 if (rt_deadline == 0)
503 rt_deadline = now + ip_rt_max_delay;
504
505 mod_timer(&rt_flush_timer, now+delay);
506 spin_unlock_bh(&rt_flush_lock);
507}
508
509static void rt_secret_rebuild(unsigned long dummy)
510{
511 unsigned long now = jiffies;
512
513 rt_cache_flush(0);
514 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
515}
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530static int rt_garbage_collect(void)
531{
532 static unsigned long expire = RT_GC_TIMEOUT;
533 static unsigned long last_gc;
534 static int rover;
535 static int equilibrium;
536 struct rtable *rth, **rthp;
537 unsigned long now = jiffies;
538 int goal;
539
540
541
542
543
544
545 rt_cache_stat[smp_processor_id()].gc_total++;
546
547 if (now - last_gc < ip_rt_gc_min_interval &&
548 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
549 rt_cache_stat[smp_processor_id()].gc_ignored++;
550 goto out;
551 }
552
553
554 goal = atomic_read(&ipv4_dst_ops.entries) -
555 (ip_rt_gc_elasticity << rt_hash_log);
556 if (goal <= 0) {
557 if (equilibrium < ipv4_dst_ops.gc_thresh)
558 equilibrium = ipv4_dst_ops.gc_thresh;
559 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
560 if (goal > 0) {
561 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
562 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
563 }
564 } else {
565
566
567
568 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
569 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
570 }
571
572 if (now - last_gc >= ip_rt_gc_min_interval)
573 last_gc = now;
574
575 if (goal <= 0) {
576 equilibrium += goal;
577 goto work_done;
578 }
579
580 do {
581 int i, k;
582
583 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
584 unsigned long tmo = expire;
585
586 k = (k + 1) & rt_hash_mask;
587 rthp = &rt_hash_table[k].chain;
588 write_lock_bh(&rt_hash_table[k].lock);
589 while ((rth = *rthp) != NULL) {
590 if (!rt_may_expire(rth, tmo, expire)) {
591 tmo >>= 1;
592 rthp = &rth->u.rt_next;
593 continue;
594 }
595 *rthp = rth->u.rt_next;
596 rt_free(rth);
597 goal--;
598 }
599 write_unlock_bh(&rt_hash_table[k].lock);
600 if (goal <= 0)
601 break;
602 }
603 rover = k;
604
605 if (goal <= 0)
606 goto work_done;
607
608
609
610
611
612
613
614
615
616
617 rt_cache_stat[smp_processor_id()].gc_goal_miss++;
618
619 if (expire == 0)
620 break;
621
622 expire >>= 1;
623#if RT_CACHE_DEBUG >= 2
624 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
625 atomic_read(&ipv4_dst_ops.entries), goal, i);
626#endif
627
628 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
629 goto out;
630 } while (!in_softirq() && time_before_eq(jiffies, now));
631
632 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
633 goto out;
634 if (net_ratelimit())
635 printk(KERN_WARNING "dst cache overflow\n");
636 rt_cache_stat[smp_processor_id()].gc_dst_overflow++;
637 return 1;
638
639work_done:
640 expire += ip_rt_gc_min_interval;
641 if (expire > ip_rt_gc_timeout ||
642 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
643 expire = ip_rt_gc_timeout;
644#if RT_CACHE_DEBUG >= 2
645 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
646 atomic_read(&ipv4_dst_ops.entries), goal, rover);
647#endif
648out: return 0;
649}
650
651static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
652{
653 struct rtable *rth, **rthp;
654 unsigned long now;
655 struct rtable *cand, **candp;
656 u32 min_score;
657 int chain_length;
658 int attempts = !in_softirq();
659
660restart:
661 chain_length = 0;
662 min_score = ~(u32)0;
663 cand = NULL;
664 candp = NULL;
665 now = jiffies;
666
667 rthp = &rt_hash_table[hash].chain;
668
669 write_lock_bh(&rt_hash_table[hash].lock);
670 while ((rth = *rthp) != NULL) {
671 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
672
673 *rthp = rth->u.rt_next;
674 rth->u.rt_next = rt_hash_table[hash].chain;
675 rt_hash_table[hash].chain = rth;
676
677 rth->u.dst.__use++;
678 dst_hold(&rth->u.dst);
679 rth->u.dst.lastuse = now;
680 write_unlock_bh(&rt_hash_table[hash].lock);
681
682 rt_drop(rt);
683 *rp = rth;
684 return 0;
685 }
686
687 if (!atomic_read(&rth->u.dst.__refcnt)) {
688 u32 score = rt_score(rth);
689
690 if (score <= min_score) {
691 cand = rth;
692 candp = rthp;
693 min_score = score;
694 }
695 }
696
697 chain_length++;
698
699 rthp = &rth->u.rt_next;
700 }
701
702 if (cand) {
703
704
705
706
707
708
709 if (chain_length > ip_rt_gc_elasticity) {
710 *candp = cand->u.rt_next;
711 rt_free(cand);
712 }
713 }
714
715
716
717
718 if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
719 int err = arp_bind_neighbour(&rt->u.dst);
720 if (err) {
721 write_unlock_bh(&rt_hash_table[hash].lock);
722
723 if (err != -ENOBUFS) {
724 rt_drop(rt);
725 return err;
726 }
727
728
729
730
731
732 if (attempts-- > 0) {
733 int saved_elasticity = ip_rt_gc_elasticity;
734 int saved_int = ip_rt_gc_min_interval;
735 ip_rt_gc_elasticity = 1;
736 ip_rt_gc_min_interval = 0;
737 rt_garbage_collect();
738 ip_rt_gc_min_interval = saved_int;
739 ip_rt_gc_elasticity = saved_elasticity;
740 goto restart;
741 }
742
743 if (net_ratelimit())
744 printk(KERN_WARNING "Neighbour table overflow.\n");
745 rt_drop(rt);
746 return -ENOBUFS;
747 }
748 }
749
750 rt->u.rt_next = rt_hash_table[hash].chain;
751#if RT_CACHE_DEBUG >= 2
752 if (rt->u.rt_next) {
753 struct rtable *trt;
754 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
755 NIPQUAD(rt->rt_dst));
756 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
757 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
758 printk("\n");
759 }
760#endif
761 rt_hash_table[hash].chain = rt;
762 write_unlock_bh(&rt_hash_table[hash].lock);
763 *rp = rt;
764 return 0;
765}
766
767void rt_bind_peer(struct rtable *rt, int create)
768{
769 static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
770 struct inet_peer *peer;
771
772 peer = inet_getpeer(rt->rt_dst, create);
773
774 spin_lock_bh(&rt_peer_lock);
775 if (rt->peer == NULL) {
776 rt->peer = peer;
777 peer = NULL;
778 }
779 spin_unlock_bh(&rt_peer_lock);
780 if (peer)
781 inet_putpeer(peer);
782}
783
784
785
786
787
788
789
790
791static void ip_select_fb_ident(struct iphdr *iph)
792{
793 static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
794 static u32 ip_fallback_id;
795 u32 salt;
796
797 spin_lock_bh(&ip_fb_id_lock);
798 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
799 iph->id = htons(salt & 0xFFFF);
800 ip_fallback_id = salt;
801 spin_unlock_bh(&ip_fb_id_lock);
802}
803
804void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
805{
806 struct rtable *rt = (struct rtable *) dst;
807
808 if (rt) {
809 if (rt->peer == NULL)
810 rt_bind_peer(rt, 1);
811
812
813
814
815 if (rt->peer) {
816 iph->id = htons(inet_getid(rt->peer));
817 return;
818 }
819 } else
820 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
821
822 ip_select_fb_ident(iph);
823}
824
825static void rt_del(unsigned hash, struct rtable *rt)
826{
827 struct rtable **rthp;
828
829 write_lock_bh(&rt_hash_table[hash].lock);
830 ip_rt_put(rt);
831 for (rthp = &rt_hash_table[hash].chain; *rthp;
832 rthp = &(*rthp)->u.rt_next)
833 if (*rthp == rt) {
834 *rthp = rt->u.rt_next;
835 rt_free(rt);
836 break;
837 }
838 write_unlock_bh(&rt_hash_table[hash].lock);
839}
840
841void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
842 u32 saddr, u8 tos, struct net_device *dev)
843{
844 int i, k;
845 struct in_device *in_dev = in_dev_get(dev);
846 struct rtable *rth, **rthp;
847 u32 skeys[2] = { saddr, 0 };
848 int ikeys[2] = { dev->ifindex, 0 };
849
850 tos &= IPTOS_RT_MASK;
851
852 if (!in_dev)
853 return;
854
855 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
856 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
857 goto reject_redirect;
858
859 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
860 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
861 goto reject_redirect;
862 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
863 goto reject_redirect;
864 } else {
865 if (inet_addr_type(new_gw) != RTN_UNICAST)
866 goto reject_redirect;
867 }
868
869 for (i = 0; i < 2; i++) {
870 for (k = 0; k < 2; k++) {
871 unsigned hash = rt_hash_code(daddr,
872 skeys[i] ^ (ikeys[k] << 5),
873 tos);
874
875 rthp=&rt_hash_table[hash].chain;
876
877 read_lock(&rt_hash_table[hash].lock);
878 while ((rth = *rthp) != NULL) {
879 struct rtable *rt;
880
881 if (rth->key.dst != daddr ||
882 rth->key.src != skeys[i] ||
883 rth->key.tos != tos ||
884 rth->key.oif != ikeys[k] ||
885 rth->key.iif != 0) {
886 rthp = &rth->u.rt_next;
887 continue;
888 }
889
890 if (rth->rt_dst != daddr ||
891 rth->rt_src != saddr ||
892 rth->u.dst.error ||
893 rth->rt_gateway != old_gw ||
894 rth->u.dst.dev != dev)
895 break;
896
897 dst_hold(&rth->u.dst);
898 read_unlock(&rt_hash_table[hash].lock);
899
900 rt = dst_alloc(&ipv4_dst_ops);
901 if (rt == NULL) {
902 ip_rt_put(rth);
903 in_dev_put(in_dev);
904 return;
905 }
906
907
908 *rt = *rth;
909 rt->u.dst.__use = 1;
910 atomic_set(&rt->u.dst.__refcnt, 1);
911 if (rt->u.dst.dev)
912 dev_hold(rt->u.dst.dev);
913 rt->u.dst.lastuse = jiffies;
914 rt->u.dst.neighbour = NULL;
915 rt->u.dst.hh = NULL;
916 rt->u.dst.obsolete = 0;
917
918 rt->rt_flags |= RTCF_REDIRECTED;
919
920
921 rt->rt_gateway = new_gw;
922
923
924 dst_confirm(&rth->u.dst);
925
926 if (rt->peer)
927 atomic_inc(&rt->peer->refcnt);
928
929 if (arp_bind_neighbour(&rt->u.dst) ||
930 !(rt->u.dst.neighbour->nud_state &
931 NUD_VALID)) {
932 if (rt->u.dst.neighbour)
933 neigh_event_send(rt->u.dst.neighbour, NULL);
934 ip_rt_put(rth);
935 rt_drop(rt);
936 goto do_next;
937 }
938
939 rt_del(hash, rth);
940 if (!rt_intern_hash(hash, rt, &rt))
941 ip_rt_put(rt);
942 goto do_next;
943 }
944 read_unlock(&rt_hash_table[hash].lock);
945 do_next:
946 ;
947 }
948 }
949 in_dev_put(in_dev);
950 return;
951
952reject_redirect:
953#ifdef CONFIG_IP_ROUTE_VERBOSE
954 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
955 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
956 "%u.%u.%u.%u ignored.\n"
957 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
958 "tos %02x\n",
959 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
960 NIPQUAD(saddr), NIPQUAD(daddr), tos);
961#endif
962 in_dev_put(in_dev);
963}
964
965static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
966{
967 struct rtable *rt = (struct rtable*)dst;
968 struct dst_entry *ret = dst;
969
970 if (rt) {
971 if (dst->obsolete) {
972 ip_rt_put(rt);
973 ret = NULL;
974 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
975 rt->u.dst.expires) {
976 unsigned hash = rt_hash_code(rt->key.dst,
977 rt->key.src ^
978 (rt->key.oif << 5),
979 rt->key.tos);
980#if RT_CACHE_DEBUG >= 1
981 printk(KERN_DEBUG "ip_rt_advice: redirect to "
982 "%u.%u.%u.%u/%02x dropped\n",
983 NIPQUAD(rt->rt_dst), rt->key.tos);
984#endif
985 rt_del(hash, rt);
986 ret = NULL;
987 }
988 }
989 return ret;
990}
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008void ip_rt_send_redirect(struct sk_buff *skb)
1009{
1010 struct rtable *rt = (struct rtable*)skb->dst;
1011 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1012
1013 if (!in_dev)
1014 return;
1015
1016 if (!IN_DEV_TX_REDIRECTS(in_dev))
1017 goto out;
1018
1019
1020
1021
1022 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1023 rt->u.dst.rate_tokens = 0;
1024
1025
1026
1027
1028 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1029 rt->u.dst.rate_last = jiffies;
1030 goto out;
1031 }
1032
1033
1034
1035
1036 if (time_after(jiffies,
1037 (rt->u.dst.rate_last +
1038 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1039 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1040 rt->u.dst.rate_last = jiffies;
1041 ++rt->u.dst.rate_tokens;
1042#ifdef CONFIG_IP_ROUTE_VERBOSE
1043 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1044 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1045 net_ratelimit())
1046 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1047 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1048 NIPQUAD(rt->rt_src), rt->rt_iif,
1049 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1050#endif
1051 }
1052out:
1053 in_dev_put(in_dev);
1054}
1055
1056static int ip_error(struct sk_buff *skb)
1057{
1058 struct rtable *rt = (struct rtable*)skb->dst;
1059 unsigned long now;
1060 int code;
1061
1062 switch (rt->u.dst.error) {
1063 case EINVAL:
1064 default:
1065 goto out;
1066 case EHOSTUNREACH:
1067 code = ICMP_HOST_UNREACH;
1068 break;
1069 case ENETUNREACH:
1070 code = ICMP_NET_UNREACH;
1071 break;
1072 case EACCES:
1073 code = ICMP_PKT_FILTERED;
1074 break;
1075 }
1076
1077 now = jiffies;
1078 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1079 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1080 rt->u.dst.rate_tokens = ip_rt_error_burst;
1081 rt->u.dst.rate_last = now;
1082 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1083 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1084 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1085 }
1086
1087out: kfree_skb(skb);
1088 return 0;
1089}
1090
1091
1092
1093
1094
1095
/* Candidate MTU values, in descending order. */
static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *step = mtu_plateau;
	const unsigned short *end =
		mtu_plateau + sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	while (step != end) {
		if (*step < old_mtu)
			return *step;
		step++;
	}
	return 68;
}
1108
1109unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1110{
1111 int i;
1112 unsigned short old_mtu = ntohs(iph->tot_len);
1113 struct rtable *rth;
1114 u32 skeys[2] = { iph->saddr, 0, };
1115 u32 daddr = iph->daddr;
1116 u8 tos = iph->tos & IPTOS_RT_MASK;
1117 unsigned short est_mtu = 0;
1118
1119 if (ipv4_config.no_pmtu_disc)
1120 return 0;
1121
1122 for (i = 0; i < 2; i++) {
1123 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1124
1125 read_lock(&rt_hash_table[hash].lock);
1126 for (rth = rt_hash_table[hash].chain; rth;
1127 rth = rth->u.rt_next) {
1128 if (rth->key.dst == daddr &&
1129 rth->key.src == skeys[i] &&
1130 rth->rt_dst == daddr &&
1131 rth->rt_src == iph->saddr &&
1132 rth->key.tos == tos &&
1133 rth->key.iif == 0 &&
1134 !(rth->u.dst.mxlock & (1 << RTAX_MTU))) {
1135 unsigned short mtu = new_mtu;
1136
1137 if (new_mtu < 68 || new_mtu >= old_mtu) {
1138
1139
1140 if (mtu == 0 &&
1141 old_mtu >= rth->u.dst.pmtu &&
1142 old_mtu >= 68 + (iph->ihl << 2))
1143 old_mtu -= iph->ihl << 2;
1144
1145 mtu = guess_mtu(old_mtu);
1146 }
1147 if (mtu <= rth->u.dst.pmtu) {
1148 if (mtu < rth->u.dst.pmtu) {
1149 dst_confirm(&rth->u.dst);
1150 if (mtu < ip_rt_min_pmtu) {
1151 mtu = ip_rt_min_pmtu;
1152 rth->u.dst.mxlock |=
1153 (1 << RTAX_MTU);
1154 }
1155 rth->u.dst.pmtu = mtu;
1156 dst_set_expires(&rth->u.dst,
1157 ip_rt_mtu_expires);
1158 }
1159 est_mtu = mtu;
1160 }
1161 }
1162 }
1163 read_unlock(&rt_hash_table[hash].lock);
1164 }
1165 return est_mtu ? : new_mtu;
1166}
1167
1168void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
1169{
1170 if (dst->pmtu > mtu && mtu >= 68 &&
1171 !(dst->mxlock & (1 << RTAX_MTU))) {
1172 if (mtu < ip_rt_min_pmtu) {
1173 mtu = ip_rt_min_pmtu;
1174 dst->mxlock |= (1 << RTAX_MTU);
1175 }
1176 dst->pmtu = mtu;
1177 dst_set_expires(dst, ip_rt_mtu_expires);
1178 }
1179}
1180
1181static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1182{
1183 dst_release(dst);
1184 return NULL;
1185}
1186
1187static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
1188 struct sk_buff *skb)
1189{
1190 return NULL;
1191}
1192
1193static void ipv4_dst_destroy(struct dst_entry *dst)
1194{
1195 struct rtable *rt = (struct rtable *) dst;
1196 struct inet_peer *peer = rt->peer;
1197
1198 if (peer) {
1199 rt->peer = NULL;
1200 inet_putpeer(peer);
1201 }
1202}
1203
1204static void ipv4_link_failure(struct sk_buff *skb)
1205{
1206 struct rtable *rt;
1207
1208 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1209
1210 rt = (struct rtable *) skb->dst;
1211 if (rt)
1212 dst_set_expires(&rt->u.dst, 0);
1213}
1214
1215static int ip_rt_bug(struct sk_buff *skb)
1216{
1217 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1218 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1219 skb->dev ? skb->dev->name : "?");
1220 kfree_skb(skb);
1221 return 0;
1222}
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233void ip_rt_get_source(u8 *addr, struct rtable *rt)
1234{
1235 u32 src;
1236 struct fib_result res;
1237
1238 if (rt->key.iif == 0)
1239 src = rt->rt_src;
1240 else if (fib_lookup(&rt->key, &res) == 0) {
1241#ifdef CONFIG_IP_ROUTE_NAT
1242 if (res.type == RTN_NAT)
1243 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1244 RT_SCOPE_UNIVERSE);
1245 else
1246#endif
1247 src = FIB_RES_PREFSRC(res);
1248 fib_res_put(&res);
1249 } else
1250 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1251 RT_SCOPE_UNIVERSE);
1252 memcpy(addr, &src, 4);
1253}
1254
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits are
 * handled independently; a half is filled in only if it is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->u.dst.tclassid & low_half) == 0)
		rt->u.dst.tclassid |= tag & low_half;
	if ((rt->u.dst.tclassid & high_half) == 0)
		rt->u.dst.tclassid |= tag & high_half;
}
#endif
1264
1265static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1266{
1267 struct fib_info *fi = res->fi;
1268
1269 if (fi) {
1270 if (FIB_RES_GW(*res) &&
1271 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1272 rt->rt_gateway = FIB_RES_GW(*res);
1273 memcpy(&rt->u.dst.mxlock, fi->fib_metrics,
1274 sizeof(fi->fib_metrics));
1275 if (fi->fib_mtu == 0) {
1276 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1277 if (rt->u.dst.mxlock & (1 << RTAX_MTU) &&
1278 rt->rt_gateway != rt->rt_dst &&
1279 rt->u.dst.pmtu > 576)
1280 rt->u.dst.pmtu = 576;
1281 }
1282#ifdef CONFIG_NET_CLS_ROUTE
1283 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1284#endif
1285 } else
1286 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1287
1288 if (rt->u.dst.pmtu > IP_MAX_MTU)
1289 rt->u.dst.pmtu = IP_MAX_MTU;
1290 if (rt->u.dst.advmss == 0)
1291 rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1292 ip_rt_min_advmss);
1293 if (rt->u.dst.advmss > 65535 - 40)
1294 rt->u.dst.advmss = 65535 - 40;
1295
1296#ifdef CONFIG_NET_CLS_ROUTE
1297#ifdef CONFIG_IP_MULTIPLE_TABLES
1298 set_class_tag(rt, fib_rules_tclass(res));
1299#endif
1300 set_class_tag(rt, itag);
1301#endif
1302 rt->rt_type = res->type;
1303}
1304
1305static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1306 u8 tos, struct net_device *dev, int our)
1307{
1308 unsigned hash;
1309 struct rtable *rth;
1310 u32 spec_dst;
1311 struct in_device *in_dev = in_dev_get(dev);
1312 u32 itag = 0;
1313
1314
1315
1316 if (in_dev == NULL)
1317 return -EINVAL;
1318
1319 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1320 skb->protocol != htons(ETH_P_IP))
1321 goto e_inval;
1322
1323 if (ZERONET(saddr)) {
1324 if (!LOCAL_MCAST(daddr))
1325 goto e_inval;
1326 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1327 } else if (fib_validate_source(saddr, 0, tos, 0,
1328 dev, &spec_dst, &itag) < 0)
1329 goto e_inval;
1330
1331 rth = dst_alloc(&ipv4_dst_ops);
1332 if (!rth)
1333 goto e_nobufs;
1334
1335 rth->u.dst.output= ip_rt_bug;
1336
1337 atomic_set(&rth->u.dst.__refcnt, 1);
1338 rth->u.dst.flags= DST_HOST;
1339 rth->key.dst = daddr;
1340 rth->rt_dst = daddr;
1341 rth->key.tos = tos;
1342#ifdef CONFIG_IP_ROUTE_FWMARK
1343 rth->key.fwmark = skb->nfmark;
1344#endif
1345 rth->key.src = saddr;
1346 rth->rt_src = saddr;
1347#ifdef CONFIG_IP_ROUTE_NAT
1348 rth->rt_dst_map = daddr;
1349 rth->rt_src_map = saddr;
1350#endif
1351#ifdef CONFIG_NET_CLS_ROUTE
1352 rth->u.dst.tclassid = itag;
1353#endif
1354 rth->rt_iif =
1355 rth->key.iif = dev->ifindex;
1356 rth->u.dst.dev = &loopback_dev;
1357 dev_hold(rth->u.dst.dev);
1358 rth->key.oif = 0;
1359 rth->rt_gateway = daddr;
1360 rth->rt_spec_dst= spec_dst;
1361 rth->rt_type = RTN_MULTICAST;
1362 rth->rt_flags = RTCF_MULTICAST;
1363 if (our) {
1364 rth->u.dst.input= ip_local_deliver;
1365 rth->rt_flags |= RTCF_LOCAL;
1366 }
1367
1368#ifdef CONFIG_IP_MROUTE
1369 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1370 rth->u.dst.input = ip_mr_input;
1371#endif
1372 rt_cache_stat[smp_processor_id()].in_slow_mc++;
1373
1374 in_dev_put(in_dev);
1375 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1376 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1377
1378e_nobufs:
1379 in_dev_put(in_dev);
1380 return -ENOBUFS;
1381
1382e_inval:
1383 in_dev_put(in_dev);
1384 return -EINVAL;
1385}
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1398 u8 tos, struct net_device *dev)
1399{
1400 struct rt_key key;
1401 struct fib_result res;
1402 struct in_device *in_dev = in_dev_get(dev);
1403 struct in_device *out_dev = NULL;
1404 unsigned flags = 0;
1405 u32 itag = 0;
1406 struct rtable * rth;
1407 unsigned hash;
1408 u32 spec_dst;
1409 int err = -EINVAL;
1410 int free_res = 0;
1411
1412
1413
1414 if (!in_dev)
1415 goto out;
1416
1417 key.dst = daddr;
1418 key.src = saddr;
1419 key.tos = tos;
1420#ifdef CONFIG_IP_ROUTE_FWMARK
1421 key.fwmark = skb->nfmark;
1422#endif
1423 key.iif = dev->ifindex;
1424 key.oif = 0;
1425 key.scope = RT_SCOPE_UNIVERSE;
1426
1427 hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos);
1428
1429
1430
1431
1432
1433 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1434 goto martian_source;
1435
1436 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1437 goto brd_input;
1438
1439
1440
1441
1442 if (ZERONET(saddr))
1443 goto martian_source;
1444
1445 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1446 goto martian_destination;
1447
1448
1449
1450
1451 if ((err = fib_lookup(&key, &res)) != 0) {
1452 if (!IN_DEV_FORWARD(in_dev))
1453 goto e_inval;
1454 goto no_route;
1455 }
1456 free_res = 1;
1457
1458 rt_cache_stat[smp_processor_id()].in_slow_tot++;
1459
1460#ifdef CONFIG_IP_ROUTE_NAT
1461
1462
1463
1464
1465 if (1) {
1466 u32 src_map = saddr;
1467 if (res.r)
1468 src_map = fib_rules_policy(saddr, &res, &flags);
1469
1470 if (res.type == RTN_NAT) {
1471 key.dst = fib_rules_map_destination(daddr, &res);
1472 fib_res_put(&res);
1473 free_res = 0;
1474 if (fib_lookup(&key, &res))
1475 goto e_inval;
1476 free_res = 1;
1477 if (res.type != RTN_UNICAST)
1478 goto e_inval;
1479 flags |= RTCF_DNAT;
1480 }
1481 key.src = src_map;
1482 }
1483#endif
1484
1485 if (res.type == RTN_BROADCAST)
1486 goto brd_input;
1487
1488 if (res.type == RTN_LOCAL) {
1489 int result;
1490 result = fib_validate_source(saddr, daddr, tos,
1491 loopback_dev.ifindex,
1492 dev, &spec_dst, &itag);
1493 if (result < 0)
1494 goto martian_source;
1495 if (result)
1496 flags |= RTCF_DIRECTSRC;
1497 spec_dst = daddr;
1498 goto local_input;
1499 }
1500
1501 if (!IN_DEV_FORWARD(in_dev))
1502 goto e_inval;
1503 if (res.type != RTN_UNICAST)
1504 goto martian_destination;
1505
1506#ifdef CONFIG_IP_ROUTE_MULTIPATH
1507 if (res.fi->fib_nhs > 1 && key.oif == 0)
1508 fib_select_multipath(&key, &res);
1509#endif
1510 out_dev = in_dev_get(FIB_RES_DEV(res));
1511 if (out_dev == NULL) {
1512 if (net_ratelimit())
1513 printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1514 "Please, report\n");
1515 goto e_inval;
1516 }
1517
1518 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1519 &spec_dst, &itag);
1520 if (err < 0)
1521 goto martian_source;
1522
1523 if (err)
1524 flags |= RTCF_DIRECTSRC;
1525
1526 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1527 (IN_DEV_SHARED_MEDIA(out_dev) ||
1528 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1529 flags |= RTCF_DOREDIRECT;
1530
1531 if (skb->protocol != htons(ETH_P_IP)) {
1532
1533
1534
1535 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1536 goto e_inval;
1537 }
1538
1539 rth = dst_alloc(&ipv4_dst_ops);
1540 if (!rth)
1541 goto e_nobufs;
1542
1543 atomic_set(&rth->u.dst.__refcnt, 1);
1544 rth->u.dst.flags= DST_HOST;
1545 rth->key.dst = daddr;
1546 rth->rt_dst = daddr;
1547 rth->key.tos = tos;
1548#ifdef CONFIG_IP_ROUTE_FWMARK
1549 rth->key.fwmark = skb->nfmark;
1550#endif
1551 rth->key.src = saddr;
1552 rth->rt_src = saddr;
1553 rth->rt_gateway = daddr;
1554#ifdef CONFIG_IP_ROUTE_NAT
1555 rth->rt_src_map = key.src;
1556 rth->rt_dst_map = key.dst;
1557 if (flags&RTCF_DNAT)
1558 rth->rt_gateway = key.dst;
1559#endif
1560 rth->rt_iif =
1561 rth->key.iif = dev->ifindex;
1562 rth->u.dst.dev = out_dev->dev;
1563 dev_hold(rth->u.dst.dev);
1564 rth->key.oif = 0;
1565 rth->rt_spec_dst= spec_dst;
1566
1567 rth->u.dst.input = ip_forward;
1568 rth->u.dst.output = ip_output;
1569
1570 rt_set_nexthop(rth, &res, itag);
1571
1572 rth->rt_flags = flags;
1573
1574#ifdef CONFIG_NET_FASTROUTE
1575 if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1576 struct net_device *odev = rth->u.dst.dev;
1577 if (odev != dev &&
1578 dev->accept_fastpath &&
1579 odev->mtu >= dev->mtu &&
1580 dev->accept_fastpath(dev, &rth->u.dst) == 0)
1581 rth->rt_flags |= RTCF_FAST;
1582 }
1583#endif
1584
1585intern:
1586 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1587done:
1588 in_dev_put(in_dev);
1589 if (out_dev)
1590 in_dev_put(out_dev);
1591 if (free_res)
1592 fib_res_put(&res);
1593out: return err;
1594
1595brd_input:
1596 if (skb->protocol != htons(ETH_P_IP))
1597 goto e_inval;
1598
1599 if (ZERONET(saddr))
1600 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1601 else {
1602 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1603 &itag);
1604 if (err < 0)
1605 goto martian_source;
1606 if (err)
1607 flags |= RTCF_DIRECTSRC;
1608 }
1609 flags |= RTCF_BROADCAST;
1610 res.type = RTN_BROADCAST;
1611 rt_cache_stat[smp_processor_id()].in_brd++;
1612
1613local_input:
1614 rth = dst_alloc(&ipv4_dst_ops);
1615 if (!rth)
1616 goto e_nobufs;
1617
1618 rth->u.dst.output= ip_rt_bug;
1619
1620 atomic_set(&rth->u.dst.__refcnt, 1);
1621 rth->u.dst.flags= DST_HOST;
1622 rth->key.dst = daddr;
1623 rth->rt_dst = daddr;
1624 rth->key.tos = tos;
1625#ifdef CONFIG_IP_ROUTE_FWMARK
1626 rth->key.fwmark = skb->nfmark;
1627#endif
1628 rth->key.src = saddr;
1629 rth->rt_src = saddr;
1630#ifdef CONFIG_IP_ROUTE_NAT
1631 rth->rt_dst_map = key.dst;
1632 rth->rt_src_map = key.src;
1633#endif
1634#ifdef CONFIG_NET_CLS_ROUTE
1635 rth->u.dst.tclassid = itag;
1636#endif
1637 rth->rt_iif =
1638 rth->key.iif = dev->ifindex;
1639 rth->u.dst.dev = &loopback_dev;
1640 dev_hold(rth->u.dst.dev);
1641 rth->key.oif = 0;
1642 rth->rt_gateway = daddr;
1643 rth->rt_spec_dst= spec_dst;
1644 rth->u.dst.input= ip_local_deliver;
1645 rth->rt_flags = flags|RTCF_LOCAL;
1646 if (res.type == RTN_UNREACHABLE) {
1647 rth->u.dst.input= ip_error;
1648 rth->u.dst.error= -err;
1649 rth->rt_flags &= ~RTCF_LOCAL;
1650 }
1651 rth->rt_type = res.type;
1652 goto intern;
1653
1654no_route:
1655 rt_cache_stat[smp_processor_id()].in_no_route++;
1656 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1657 res.type = RTN_UNREACHABLE;
1658 goto local_input;
1659
1660
1661
1662
1663martian_destination:
1664 rt_cache_stat[smp_processor_id()].in_martian_dst++;
1665#ifdef CONFIG_IP_ROUTE_VERBOSE
1666 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1667 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1668 "%u.%u.%u.%u, dev %s\n",
1669 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1670#endif
1671e_inval:
1672 err = -EINVAL;
1673 goto done;
1674
1675e_nobufs:
1676 err = -ENOBUFS;
1677 goto done;
1678
1679martian_source:
1680
1681 rt_cache_stat[smp_processor_id()].in_martian_src++;
1682#ifdef CONFIG_IP_ROUTE_VERBOSE
1683 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1684
1685
1686
1687
1688 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1689 "%u.%u.%u.%u, on dev %s\n",
1690 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1691 if (dev->hard_header_len) {
1692 int i;
1693 unsigned char *p = skb->mac.raw;
1694 printk(KERN_WARNING "ll header: ");
1695 for (i = 0; i < dev->hard_header_len; i++, p++) {
1696 printk("%02x", *p);
1697 if (i < (dev->hard_header_len - 1))
1698 printk(":");
1699 }
1700 printk("\n");
1701 }
1702 }
1703#endif
1704 goto e_inval;
1705}
1706
1707int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1708 u8 tos, struct net_device *dev)
1709{
1710 struct rtable * rth;
1711 unsigned hash;
1712 int iif = dev->ifindex;
1713
1714 tos &= IPTOS_RT_MASK;
1715 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1716
1717 read_lock(&rt_hash_table[hash].lock);
1718 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1719 if (rth->key.dst == daddr &&
1720 rth->key.src == saddr &&
1721 rth->key.iif == iif &&
1722 rth->key.oif == 0 &&
1723#ifdef CONFIG_IP_ROUTE_FWMARK
1724 rth->key.fwmark == skb->nfmark &&
1725#endif
1726 rth->key.tos == tos) {
1727 rth->u.dst.lastuse = jiffies;
1728 dst_hold(&rth->u.dst);
1729 rth->u.dst.__use++;
1730 rt_cache_stat[smp_processor_id()].in_hit++;
1731 read_unlock(&rt_hash_table[hash].lock);
1732 skb->dst = (struct dst_entry*)rth;
1733 return 0;
1734 }
1735 rt_cache_stat[smp_processor_id()].in_hlist_search++;
1736 }
1737 read_unlock(&rt_hash_table[hash].lock);
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750 if (MULTICAST(daddr)) {
1751 struct in_device *in_dev;
1752
1753 read_lock(&inetdev_lock);
1754 if ((in_dev = __in_dev_get(dev)) != NULL) {
1755 int our = ip_check_mc(in_dev, daddr, saddr);
1756 if (our
1757#ifdef CONFIG_IP_MROUTE
1758 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1759#endif
1760 ) {
1761 read_unlock(&inetdev_lock);
1762 return ip_route_input_mc(skb, daddr, saddr,
1763 tos, dev, our);
1764 }
1765 }
1766 read_unlock(&inetdev_lock);
1767 return -EINVAL;
1768 }
1769 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1770}
1771
1772
1773
1774
1775
1776int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
1777{
1778 struct rt_key key;
1779 struct fib_result res;
1780 unsigned flags = 0;
1781 struct rtable *rth;
1782 struct net_device *dev_out = NULL;
1783 unsigned hash;
1784 int free_res = 0;
1785 int err;
1786 u32 tos;
1787
1788 tos = oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK);
1789 key.dst = oldkey->dst;
1790 key.src = oldkey->src;
1791 key.tos = tos & IPTOS_RT_MASK;
1792 key.iif = loopback_dev.ifindex;
1793 key.oif = oldkey->oif;
1794#ifdef CONFIG_IP_ROUTE_FWMARK
1795 key.fwmark = oldkey->fwmark;
1796#endif
1797 key.scope = (tos & RTO_ONLINK) ? RT_SCOPE_LINK :
1798 RT_SCOPE_UNIVERSE;
1799 res.fi = NULL;
1800#ifdef CONFIG_IP_MULTIPLE_TABLES
1801 res.r = NULL;
1802#endif
1803
1804 if (oldkey->src) {
1805 err = -EINVAL;
1806 if (MULTICAST(oldkey->src) ||
1807 BADCLASS(oldkey->src) ||
1808 ZERONET(oldkey->src))
1809 goto out;
1810
1811
1812 dev_out = ip_dev_find(oldkey->src);
1813 if (dev_out == NULL)
1814 goto out;
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824 if (oldkey->oif == 0
1825 && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841 key.oif = dev_out->ifindex;
1842 goto make_route;
1843 }
1844 if (dev_out)
1845 dev_put(dev_out);
1846 dev_out = NULL;
1847 }
1848 if (oldkey->oif) {
1849 dev_out = dev_get_by_index(oldkey->oif);
1850 err = -ENODEV;
1851 if (dev_out == NULL)
1852 goto out;
1853 if (__in_dev_get(dev_out) == NULL) {
1854 dev_put(dev_out);
1855 goto out;
1856 }
1857
1858 if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
1859 if (!key.src)
1860 key.src = inet_select_addr(dev_out, 0,
1861 RT_SCOPE_LINK);
1862 goto make_route;
1863 }
1864 if (!key.src) {
1865 if (MULTICAST(oldkey->dst))
1866 key.src = inet_select_addr(dev_out, 0,
1867 key.scope);
1868 else if (!oldkey->dst)
1869 key.src = inet_select_addr(dev_out, 0,
1870 RT_SCOPE_HOST);
1871 }
1872 }
1873
1874 if (!key.dst) {
1875 key.dst = key.src;
1876 if (!key.dst)
1877 key.dst = key.src = htonl(INADDR_LOOPBACK);
1878 if (dev_out)
1879 dev_put(dev_out);
1880 dev_out = &loopback_dev;
1881 dev_hold(dev_out);
1882 key.oif = loopback_dev.ifindex;
1883 res.type = RTN_LOCAL;
1884 flags |= RTCF_LOCAL;
1885 goto make_route;
1886 }
1887
1888 if (fib_lookup(&key, &res)) {
1889 res.fi = NULL;
1890 if (oldkey->oif) {
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909 if (key.src == 0)
1910 key.src = inet_select_addr(dev_out, 0,
1911 RT_SCOPE_LINK);
1912 res.type = RTN_UNICAST;
1913 goto make_route;
1914 }
1915 if (dev_out)
1916 dev_put(dev_out);
1917 err = -ENETUNREACH;
1918 goto out;
1919 }
1920 free_res = 1;
1921
1922 if (res.type == RTN_NAT)
1923 goto e_inval;
1924
1925 if (res.type == RTN_LOCAL) {
1926 if (!key.src)
1927 key.src = key.dst;
1928 if (dev_out)
1929 dev_put(dev_out);
1930 dev_out = &loopback_dev;
1931 dev_hold(dev_out);
1932 key.oif = dev_out->ifindex;
1933 if (res.fi)
1934 fib_info_put(res.fi);
1935 res.fi = NULL;
1936 flags |= RTCF_LOCAL;
1937 goto make_route;
1938 }
1939
1940#ifdef CONFIG_IP_ROUTE_MULTIPATH
1941 if (res.fi->fib_nhs > 1 && key.oif == 0)
1942 fib_select_multipath(&key, &res);
1943 else
1944#endif
1945 if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif)
1946 fib_select_default(&key, &res);
1947
1948 if (!key.src)
1949 key.src = FIB_RES_PREFSRC(res);
1950
1951 if (dev_out)
1952 dev_put(dev_out);
1953 dev_out = FIB_RES_DEV(res);
1954 dev_hold(dev_out);
1955 key.oif = dev_out->ifindex;
1956
1957make_route:
1958 if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1959 goto e_inval;
1960
1961 if (key.dst == 0xFFFFFFFF)
1962 res.type = RTN_BROADCAST;
1963 else if (MULTICAST(key.dst))
1964 res.type = RTN_MULTICAST;
1965 else if (BADCLASS(key.dst) || ZERONET(key.dst))
1966 goto e_inval;
1967
1968 if (dev_out->flags & IFF_LOOPBACK)
1969 flags |= RTCF_LOCAL;
1970
1971 if (res.type == RTN_BROADCAST) {
1972 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1973 if (res.fi) {
1974 fib_info_put(res.fi);
1975 res.fi = NULL;
1976 }
1977 } else if (res.type == RTN_MULTICAST) {
1978 flags |= RTCF_MULTICAST|RTCF_LOCAL;
1979 read_lock(&inetdev_lock);
1980 if (!__in_dev_get(dev_out) ||
1981 !ip_check_mc(__in_dev_get(dev_out),oldkey->dst,oldkey->src))
1982 flags &= ~RTCF_LOCAL;
1983 read_unlock(&inetdev_lock);
1984
1985
1986
1987
1988 if (res.fi && res.prefixlen < 4) {
1989 fib_info_put(res.fi);
1990 res.fi = NULL;
1991 }
1992 }
1993
1994 rth = dst_alloc(&ipv4_dst_ops);
1995 if (!rth)
1996 goto e_nobufs;
1997
1998 atomic_set(&rth->u.dst.__refcnt, 1);
1999 rth->u.dst.flags= DST_HOST;
2000 rth->key.dst = oldkey->dst;
2001 rth->key.tos = tos;
2002 rth->key.src = oldkey->src;
2003 rth->key.iif = 0;
2004 rth->key.oif = oldkey->oif;
2005#ifdef CONFIG_IP_ROUTE_FWMARK
2006 rth->key.fwmark = oldkey->fwmark;
2007#endif
2008 rth->rt_dst = key.dst;
2009 rth->rt_src = key.src;
2010#ifdef CONFIG_IP_ROUTE_NAT
2011 rth->rt_dst_map = key.dst;
2012 rth->rt_src_map = key.src;
2013#endif
2014 rth->rt_iif = oldkey->oif ? : dev_out->ifindex;
2015 rth->u.dst.dev = dev_out;
2016 dev_hold(dev_out);
2017 rth->rt_gateway = key.dst;
2018 rth->rt_spec_dst= key.src;
2019
2020 rth->u.dst.output=ip_output;
2021
2022 rt_cache_stat[smp_processor_id()].out_slow_tot++;
2023
2024 if (flags & RTCF_LOCAL) {
2025 rth->u.dst.input = ip_local_deliver;
2026 rth->rt_spec_dst = key.dst;
2027 }
2028 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2029 rth->rt_spec_dst = key.src;
2030 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2031 rth->u.dst.output = ip_mc_output;
2032 rt_cache_stat[smp_processor_id()].out_slow_mc++;
2033 }
2034#ifdef CONFIG_IP_MROUTE
2035 if (res.type == RTN_MULTICAST) {
2036 struct in_device *in_dev = in_dev_get(dev_out);
2037 if (in_dev) {
2038 if (IN_DEV_MFORWARD(in_dev) &&
2039 !LOCAL_MCAST(oldkey->dst)) {
2040 rth->u.dst.input = ip_mr_input;
2041 rth->u.dst.output = ip_mc_output;
2042 }
2043 in_dev_put(in_dev);
2044 }
2045 }
2046#endif
2047 }
2048
2049 rt_set_nexthop(rth, &res, 0);
2050
2051 rth->rt_flags = flags;
2052
2053 hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos);
2054 err = rt_intern_hash(hash, rth, rp);
2055done:
2056 if (free_res)
2057 fib_res_put(&res);
2058 if (dev_out)
2059 dev_put(dev_out);
2060out: return err;
2061
2062e_inval:
2063 err = -EINVAL;
2064 goto done;
2065e_nobufs:
2066 err = -ENOBUFS;
2067 goto done;
2068}
2069
2070int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
2071{
2072 unsigned hash;
2073 struct rtable *rth;
2074
2075 hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);
2076
2077 read_lock_bh(&rt_hash_table[hash].lock);
2078 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2079 if (rth->key.dst == key->dst &&
2080 rth->key.src == key->src &&
2081 rth->key.iif == 0 &&
2082 rth->key.oif == key->oif &&
2083#ifdef CONFIG_IP_ROUTE_FWMARK
2084 rth->key.fwmark == key->fwmark &&
2085#endif
2086 !((rth->key.tos ^ key->tos) &
2087 (IPTOS_RT_MASK | RTO_ONLINK))) {
2088 rth->u.dst.lastuse = jiffies;
2089 dst_hold(&rth->u.dst);
2090 rth->u.dst.__use++;
2091 rt_cache_stat[smp_processor_id()].out_hit++;
2092 read_unlock_bh(&rt_hash_table[hash].lock);
2093 *rp = rth;
2094 return 0;
2095 }
2096 rt_cache_stat[smp_processor_id()].out_hlist_search++;
2097 }
2098 read_unlock_bh(&rt_hash_table[hash].lock);
2099
2100 return ip_route_output_slow(rp, key);
2101}
2102
2103static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2104 int nowait)
2105{
2106 struct rtable *rt = (struct rtable*)skb->dst;
2107 struct rtmsg *r;
2108 struct nlmsghdr *nlh;
2109 unsigned char *b = skb->tail;
2110 struct rta_cacheinfo ci;
2111#ifdef CONFIG_IP_MROUTE
2112 struct rtattr *eptr;
2113#endif
2114 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2115 r = NLMSG_DATA(nlh);
2116 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2117 r->rtm_family = AF_INET;
2118 r->rtm_dst_len = 32;
2119 r->rtm_src_len = 0;
2120 r->rtm_tos = rt->key.tos;
2121 r->rtm_table = RT_TABLE_MAIN;
2122 r->rtm_type = rt->rt_type;
2123 r->rtm_scope = RT_SCOPE_UNIVERSE;
2124 r->rtm_protocol = RTPROT_UNSPEC;
2125 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2126 if (rt->rt_flags & RTCF_NOTIFY)
2127 r->rtm_flags |= RTM_F_NOTIFY;
2128 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2129 if (rt->key.src) {
2130 r->rtm_src_len = 32;
2131 RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
2132 }
2133 if (rt->u.dst.dev)
2134 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2135#ifdef CONFIG_NET_CLS_ROUTE
2136 if (rt->u.dst.tclassid)
2137 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2138#endif
2139 if (rt->key.iif)
2140 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2141 else if (rt->rt_src != rt->key.src)
2142 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2143 if (rt->rt_dst != rt->rt_gateway)
2144 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2145 if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
2146 goto rtattr_failure;
2147 ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
2148 ci.rta_used = rt->u.dst.__use;
2149 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2150 if (rt->u.dst.expires)
2151 ci.rta_expires = rt->u.dst.expires - jiffies;
2152 else
2153 ci.rta_expires = 0;
2154 ci.rta_error = rt->u.dst.error;
2155 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2156 if (rt->peer) {
2157 ci.rta_id = rt->peer->ip_id_count;
2158 if (rt->peer->tcp_ts_stamp) {
2159 ci.rta_ts = rt->peer->tcp_ts;
2160 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2161 }
2162 }
2163#ifdef CONFIG_IP_MROUTE
2164 eptr = (struct rtattr*)skb->tail;
2165#endif
2166 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2167 if (rt->key.iif) {
2168#ifdef CONFIG_IP_MROUTE
2169 u32 dst = rt->rt_dst;
2170
2171 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2172 ipv4_devconf.mc_forwarding) {
2173 int err = ipmr_get_route(skb, r, nowait);
2174 if (err <= 0) {
2175 if (!nowait) {
2176 if (err == 0)
2177 return 0;
2178 goto nlmsg_failure;
2179 } else {
2180 if (err == -EMSGSIZE)
2181 goto nlmsg_failure;
2182 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2183 }
2184 }
2185 } else
2186#endif
2187 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
2188 }
2189
2190 nlh->nlmsg_len = skb->tail - b;
2191 return skb->len;
2192
2193nlmsg_failure:
2194rtattr_failure:
2195 skb_trim(skb, b - skb->data);
2196 return -1;
2197}
2198
2199int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2200{
2201 struct rtattr **rta = arg;
2202 struct rtmsg *rtm = NLMSG_DATA(nlh);
2203 struct rtable *rt = NULL;
2204 u32 dst = 0;
2205 u32 src = 0;
2206 int iif = 0;
2207 int err = -ENOBUFS;
2208 struct sk_buff *skb;
2209
2210 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2211 if (!skb)
2212 goto out;
2213
2214
2215
2216
2217 skb->mac.raw = skb->data;
2218 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2219
2220 if (rta[RTA_SRC - 1])
2221 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2222 if (rta[RTA_DST - 1])
2223 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2224 if (rta[RTA_IIF - 1])
2225 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2226
2227 if (iif) {
2228 struct net_device *dev = __dev_get_by_index(iif);
2229 err = -ENODEV;
2230 if (!dev)
2231 goto out_free;
2232 skb->protocol = htons(ETH_P_IP);
2233 skb->dev = dev;
2234 local_bh_disable();
2235 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2236 local_bh_enable();
2237 rt = (struct rtable*)skb->dst;
2238 if (!err && rt->u.dst.error)
2239 err = -rt->u.dst.error;
2240 } else {
2241 int oif = 0;
2242 if (rta[RTA_OIF - 1])
2243 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2244 err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
2245 }
2246 if (err)
2247 goto out_free;
2248
2249 skb->dst = &rt->u.dst;
2250 if (rtm->rtm_flags & RTM_F_NOTIFY)
2251 rt->rt_flags |= RTCF_NOTIFY;
2252
2253 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2254
2255 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2256 RTM_NEWROUTE, 0);
2257 if (!err)
2258 goto out_free;
2259 if (err < 0) {
2260 err = -EMSGSIZE;
2261 goto out_free;
2262 }
2263
2264 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2265 if (err > 0)
2266 err = 0;
2267out: return err;
2268
2269out_free:
2270 kfree_skb(skb);
2271 goto out;
2272}
2273
2274int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2275{
2276 struct rtable *rt;
2277 int h, s_h;
2278 int idx, s_idx;
2279
2280 s_h = cb->args[0];
2281 s_idx = idx = cb->args[1];
2282 for (h = 0; h <= rt_hash_mask; h++) {
2283 if (h < s_h) continue;
2284 if (h > s_h)
2285 s_idx = 0;
2286 read_lock_bh(&rt_hash_table[h].lock);
2287 for (rt = rt_hash_table[h].chain, idx = 0; rt;
2288 rt = rt->u.rt_next, idx++) {
2289 if (idx < s_idx)
2290 continue;
2291 skb->dst = dst_clone(&rt->u.dst);
2292 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2293 cb->nlh->nlmsg_seq,
2294 RTM_NEWROUTE, 1) <= 0) {
2295 dst_release(xchg(&skb->dst, NULL));
2296 read_unlock_bh(&rt_hash_table[h].lock);
2297 goto done;
2298 }
2299 dst_release(xchg(&skb->dst, NULL));
2300 }
2301 read_unlock_bh(&rt_hash_table[h].lock);
2302 }
2303
2304done:
2305 cb->args[0] = h;
2306 cb->args[1] = idx;
2307 return skb->len;
2308}
2309
/*
 * ip_rt_multicast_event - react to a multicast change on a device.
 * @in_dev: the affected in_device (not used by this implementation)
 *
 * Calls rt_cache_flush() with a delay argument of 0.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
2314
2315#ifdef CONFIG_SYSCTL
2316static int flush_delay;
2317
2318static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2319 struct file *filp, void *buffer,
2320 size_t *lenp)
2321{
2322 if (write) {
2323 proc_dointvec(ctl, write, filp, buffer, lenp);
2324 rt_cache_flush(flush_delay);
2325 return 0;
2326 }
2327
2328 return -EINVAL;
2329}
2330
2331static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
2332 int nlen, void *oldval,
2333 size_t *oldlenp, void *newval,
2334 size_t newlen, void **context)
2335{
2336 int delay;
2337 if (newlen != sizeof(int))
2338 return -EINVAL;
2339 if (get_user(delay, (int *)newval))
2340 return -EFAULT;
2341 rt_cache_flush(delay);
2342 return 0;
2343}
2344
2345ctl_table ipv4_route_table[] = {
2346 {
2347 ctl_name: NET_IPV4_ROUTE_FLUSH,
2348 procname: "flush",
2349 data: &flush_delay,
2350 maxlen: sizeof(int),
2351 mode: 0644,
2352 proc_handler: &ipv4_sysctl_rtcache_flush,
2353 strategy: &ipv4_sysctl_rtcache_flush_strategy,
2354 },
2355 {
2356 ctl_name: NET_IPV4_ROUTE_MIN_DELAY,
2357 procname: "min_delay",
2358 data: &ip_rt_min_delay,
2359 maxlen: sizeof(int),
2360 mode: 0644,
2361 proc_handler: &proc_dointvec_jiffies,
2362 strategy: &sysctl_jiffies,
2363 },
2364 {
2365 ctl_name: NET_IPV4_ROUTE_MAX_DELAY,
2366 procname: "max_delay",
2367 data: &ip_rt_max_delay,
2368 maxlen: sizeof(int),
2369 mode: 0644,
2370 proc_handler: &proc_dointvec_jiffies,
2371 strategy: &sysctl_jiffies,
2372 },
2373 {
2374 ctl_name: NET_IPV4_ROUTE_GC_THRESH,
2375 procname: "gc_thresh",
2376 data: &ipv4_dst_ops.gc_thresh,
2377 maxlen: sizeof(int),
2378 mode: 0644,
2379 proc_handler: &proc_dointvec,
2380 },
2381 {
2382 ctl_name: NET_IPV4_ROUTE_MAX_SIZE,
2383 procname: "max_size",
2384 data: &ip_rt_max_size,
2385 maxlen: sizeof(int),
2386 mode: 0644,
2387 proc_handler: &proc_dointvec,
2388 },
2389 {
2390 ctl_name: NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2391 procname: "gc_min_interval",
2392 data: &ip_rt_gc_min_interval,
2393 maxlen: sizeof(int),
2394 mode: 0644,
2395 proc_handler: &proc_dointvec_jiffies,
2396 strategy: &sysctl_jiffies,
2397 },
2398 {
2399 ctl_name: NET_IPV4_ROUTE_GC_TIMEOUT,
2400 procname: "gc_timeout",
2401 data: &ip_rt_gc_timeout,
2402 maxlen: sizeof(int),
2403 mode: 0644,
2404 proc_handler: &proc_dointvec_jiffies,
2405 strategy: &sysctl_jiffies,
2406 },
2407 {
2408 ctl_name: NET_IPV4_ROUTE_GC_INTERVAL,
2409 procname: "gc_interval",
2410 data: &ip_rt_gc_interval,
2411 maxlen: sizeof(int),
2412 mode: 0644,
2413 proc_handler: &proc_dointvec_jiffies,
2414 strategy: &sysctl_jiffies,
2415 },
2416 {
2417 ctl_name: NET_IPV4_ROUTE_REDIRECT_LOAD,
2418 procname: "redirect_load",
2419 data: &ip_rt_redirect_load,
2420 maxlen: sizeof(int),
2421 mode: 0644,
2422 proc_handler: &proc_dointvec,
2423 },
2424 {
2425 ctl_name: NET_IPV4_ROUTE_REDIRECT_NUMBER,
2426 procname: "redirect_number",
2427 data: &ip_rt_redirect_number,
2428 maxlen: sizeof(int),
2429 mode: 0644,
2430 proc_handler: &proc_dointvec,
2431 },
2432 {
2433 ctl_name: NET_IPV4_ROUTE_REDIRECT_SILENCE,
2434 procname: "redirect_silence",
2435 data: &ip_rt_redirect_silence,
2436 maxlen: sizeof(int),
2437 mode: 0644,
2438 proc_handler: &proc_dointvec,
2439 },
2440 {
2441 ctl_name: NET_IPV4_ROUTE_ERROR_COST,
2442 procname: "error_cost",
2443 data: &ip_rt_error_cost,
2444 maxlen: sizeof(int),
2445 mode: 0644,
2446 proc_handler: &proc_dointvec,
2447 },
2448 {
2449 ctl_name: NET_IPV4_ROUTE_ERROR_BURST,
2450 procname: "error_burst",
2451 data: &ip_rt_error_burst,
2452 maxlen: sizeof(int),
2453 mode: 0644,
2454 proc_handler: &proc_dointvec,
2455 },
2456 {
2457 ctl_name: NET_IPV4_ROUTE_GC_ELASTICITY,
2458 procname: "gc_elasticity",
2459 data: &ip_rt_gc_elasticity,
2460 maxlen: sizeof(int),
2461 mode: 0644,
2462 proc_handler: &proc_dointvec,
2463 },
2464 {
2465 ctl_name: NET_IPV4_ROUTE_MTU_EXPIRES,
2466 procname: "mtu_expires",
2467 data: &ip_rt_mtu_expires,
2468 maxlen: sizeof(int),
2469 mode: 0644,
2470 proc_handler: &proc_dointvec_jiffies,
2471 strategy: &sysctl_jiffies,
2472 },
2473 {
2474 ctl_name: NET_IPV4_ROUTE_MIN_PMTU,
2475 procname: "min_pmtu",
2476 data: &ip_rt_min_pmtu,
2477 maxlen: sizeof(int),
2478 mode: 0644,
2479 proc_handler: &proc_dointvec,
2480 },
2481 {
2482 ctl_name: NET_IPV4_ROUTE_MIN_ADVMSS,
2483 procname: "min_adv_mss",
2484 data: &ip_rt_min_advmss,
2485 maxlen: sizeof(int),
2486 mode: 0644,
2487 proc_handler: &proc_dointvec,
2488 },
2489 {
2490 ctl_name: NET_IPV4_ROUTE_SECRET_INTERVAL,
2491 procname: "secret_interval",
2492 data: &ip_rt_secret_interval,
2493 maxlen: sizeof(int),
2494 mode: 0644,
2495 proc_handler: &proc_dointvec_jiffies,
2496 strategy: &sysctl_jiffies,
2497 },
2498 { 0 }
2499};
2500#endif
2501
2502#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Route accounting table.  ip_rt_init() allocates and zeroes room for
 * 256 entries per CPU (256 * NR_CPUS entries in total).
 */
struct ip_rt_acct *ip_rt_acct;

/*
 * Start of the 256-entry slice of ip_rt_acct that belongs to CPU
 * number i, translated through cpu_logical_map().
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + cpu_logical_map(i) * 256)
2509
2510static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2511 int length, int *eof, void *data)
2512{
2513 unsigned int i;
2514
2515 if ((offset & 3) || (length & 3))
2516 return -EIO;
2517
2518 if (offset >= sizeof(struct ip_rt_acct) * 256) {
2519 *eof = 1;
2520 return 0;
2521 }
2522
2523 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2524 length = sizeof(struct ip_rt_acct) * 256 - offset;
2525 *eof = 1;
2526 }
2527
2528 offset /= sizeof(u32);
2529
2530 if (length > 0) {
2531 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2532 u32 *dst = (u32 *) buffer;
2533
2534
2535 *start = buffer;
2536 memcpy(dst, src, length);
2537
2538
2539 for (i = 1; i < smp_num_cpus; i++) {
2540 unsigned int j;
2541
2542 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2543
2544 for (j = 0; j < length/4; j++)
2545 dst[j] += src[j];
2546 }
2547 }
2548 return length;
2549}
2550#endif
2551
2552void __init ip_rt_init(void)
2553{
2554 int i, order, goal;
2555
2556 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2557 (jiffies ^ (jiffies >> 7)));
2558
2559#ifdef CONFIG_NET_CLS_ROUTE
2560 for (order = 0;
2561 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2562 ;
2563 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2564 if (!ip_rt_acct)
2565 panic("IP: failed to allocate ip_rt_acct\n");
2566 memset(ip_rt_acct, 0, PAGE_SIZE << order);
2567#endif
2568
2569 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2570 sizeof(struct rtable),
2571 0, SLAB_HWCACHE_ALIGN,
2572 NULL, NULL);
2573
2574 if (!ipv4_dst_ops.kmem_cachep)
2575 panic("IP: failed to allocate ip_dst_cache\n");
2576
2577 goal = num_physpages >> (26 - PAGE_SHIFT);
2578
2579 for (order = 0; (1UL << order) < goal; order++)
2580 ;
2581
2582 do {
2583 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2584 sizeof(struct rt_hash_bucket);
2585 while (rt_hash_mask & (rt_hash_mask - 1))
2586 rt_hash_mask--;
2587 rt_hash_table = (struct rt_hash_bucket *)
2588 __get_free_pages(GFP_ATOMIC, order);
2589 } while (rt_hash_table == NULL && --order > 0);
2590
2591 if (!rt_hash_table)
2592 panic("Failed to allocate IP route cache hash table\n");
2593
2594 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2595 rt_hash_mask,
2596 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2597
2598 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2599 ;
2600
2601 rt_hash_mask--;
2602 for (i = 0; i <= rt_hash_mask; i++) {
2603 rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
2604 rt_hash_table[i].chain = NULL;
2605 }
2606
2607 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2608 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2609
2610 devinet_init();
2611 ip_fib_init();
2612
2613 rt_flush_timer.function = rt_run_flush;
2614 rt_periodic_timer.function = rt_check_expire;
2615 rt_secret_timer.function = rt_secret_rebuild;
2616
2617
2618
2619
2620 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2621 ip_rt_gc_interval;
2622 add_timer(&rt_periodic_timer);
2623
2624 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2625 ip_rt_secret_interval;
2626 add_timer(&rt_secret_timer);
2627
2628 proc_net_create ("rt_cache", 0, rt_cache_get_info);
2629 create_proc_info_entry ("rt_cache", 0, proc_net_stat,
2630 rt_cache_stat_get_info);
2631#ifdef CONFIG_NET_CLS_ROUTE
2632 create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
2633#endif
2634}
2635