1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62#include <linux/config.h>
63#include <asm/uaccess.h>
64#include <asm/system.h>
65#include <asm/bitops.h>
66#include <linux/types.h>
67#include <linux/kernel.h>
68#include <linux/sched.h>
69#include <linux/mm.h>
70#include <linux/string.h>
71#include <linux/socket.h>
72#include <linux/sockios.h>
73#include <linux/errno.h>
74#include <linux/in.h>
75#include <linux/inet.h>
76#include <linux/netdevice.h>
77#include <linux/proc_fs.h>
78#include <linux/init.h>
79#include <linux/skbuff.h>
80#include <linux/rtnetlink.h>
81#include <linux/inetdevice.h>
82#include <linux/igmp.h>
83#include <linux/pkt_sched.h>
84#include <linux/mroute.h>
85#include <net/protocol.h>
86#include <net/ip.h>
87#include <net/route.h>
88#include <net/sock.h>
89#include <net/ip_fib.h>
90#include <net/arp.h>
91#include <net/tcp.h>
92#include <net/icmp.h>
93#ifdef CONFIG_SYSCTL
94#include <linux/sysctl.h>
95#endif
96
/* Ceiling applied to a route's path MTU when it is copied from the device MTU
 * (see rt_set_nexthop()).
 */
#define IP_MAX_MTU 0xFFF0

/* Default cache-entry lifetime; seeds ip_rt_gc_timeout and the adaptive
 * "expire" horizon inside rt_garbage_collect().
 */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * Run-time tunables of the route cache, all expressed in jiffies unless they
 * are plain counts.  NOTE(review): these are presumably exported through
 * sysctl (the file includes <linux/sysctl.h> under CONFIG_SYSCTL), but the
 * table itself is not visible here.
 *
 *  ip_rt_min_delay        default delay used by rt_cache_flush() when called
 *                         with a negative delay
 *  ip_rt_max_delay        longest a requested flush may be postponed
 *                         (rt_deadline = now + ip_rt_max_delay)
 *  ip_rt_gc_thresh        not referenced in the visible part of the file
 *  ip_rt_max_size         entry count at which rt_garbage_collect() reports
 *                         overflow
 *  ip_rt_gc_timeout       age limit used by rt_check_expire() and as the
 *                         upper bound of the collector's "expire" horizon
 *  ip_rt_gc_interval      period of rt_periodic_timer
 *  ip_rt_gc_min_interval  minimum spacing between collector passes
 *  ip_rt_redirect_number  redirects sent before a host is considered to be
 *                         ignoring them (ip_rt_send_redirect())
 *  ip_rt_redirect_load    base back-off between redirects
 *  ip_rt_redirect_silence idle time after which the redirect counter resets
 *  ip_rt_error_cost       tokens consumed per ICMP error (ip_error())
 *  ip_rt_error_burst      token bucket capacity for ICMP errors
 *  ip_rt_gc_elasticity    allowed average chain length before the collector
 *                         becomes aggressive
 *  ip_rt_mtu_expires      lifetime of a learned path MTU
 */
int ip_rt_min_delay = 2*HZ;
int ip_rt_max_delay = 10*HZ;
int ip_rt_gc_thresh = RT_HASH_DIVISOR;
int ip_rt_max_size = RT_HASH_DIVISOR*16;
int ip_rt_gc_timeout = RT_GC_TIMEOUT;
int ip_rt_gc_interval = 60*HZ;
int ip_rt_gc_min_interval = 5*HZ;
int ip_rt_redirect_number = 9;
int ip_rt_redirect_load = HZ/50;
int ip_rt_redirect_silence = ((HZ/50) << (9+1));
int ip_rt_error_cost = HZ;
int ip_rt_error_burst = 5*HZ;
int ip_rt_gc_elasticity = 8;
int ip_rt_mtu_expires = 10*60*HZ;

/* Absolute time (jiffies) by which a pending delayed flush must have run;
 * 0 means no deadline is armed.  Cleared by rt_run_flush().
 */
static unsigned long rt_deadline = 0;

/* Debug print helper: printk at KERN_DEBUG level. */
#define RTprint(a...) printk(KERN_DEBUG a)

static void rt_run_flush(unsigned long dummy);

/* Timer that fires the delayed cache flush requested by rt_cache_flush(). */
static struct timer_list rt_flush_timer =
	{ NULL, NULL, 0, 0L, rt_run_flush };
/* Timer re-armed by rt_check_expire().  Its handler slot is NULL here;
 * NOTE(review): presumably filled in by init code outside this view.
 */
static struct timer_list rt_periodic_timer =
	{ NULL, NULL, 0, 0L, NULL };
126
127
128
129
130
/* Forward declarations of the callbacks installed in ipv4_dst_ops below. */
static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32);
static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
	struct sk_buff *);
static struct dst_entry * ipv4_negative_advice(struct dst_entry *);
static void ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);


/*
 * Destination-cache operations for IPv4.  The initializer is positional, so
 * its meaning depends on the field order of struct dst_ops (declared
 * elsewhere).  NOTE(review): the third value is presumably gc_thresh, which
 * rt_garbage_collect() reads as ipv4_dst_ops.gc_thresh - confirm against the
 * structure definition.  The NULL slot is a callback this file does not
 * provide.
 */
struct dst_ops ipv4_dst_ops =
{
	AF_INET,
	__constant_htons(ETH_P_IP),
	RT_HASH_DIVISOR,

	rt_garbage_collect,
	ipv4_dst_check,
	ipv4_dst_reroute,
	NULL,
	ipv4_negative_advice,
	ipv4_link_failure,
};
152
/*
 * Sixteen-entry table translating an IP type-of-service selector into a
 * traffic-control priority band (TC_PRIO_*).  Every odd slot is
 * TC_PRIO_FILLER; even slots step through best-effort, bulk, interactive and
 * interactive-bulk, two slots each.
 * NOTE(review): the table is not indexed anywhere in the visible part of the
 * file; presumably callers index it with the TOS bits shifted right by one -
 * confirm against the users.
 */
__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	TC_PRIO_FILLER,
	TC_PRIO_BESTEFFORT,
	TC_PRIO_FILLER,
	TC_PRIO_BULK,
	TC_PRIO_FILLER,
	TC_PRIO_BULK,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE_BULK,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE_BULK,
	TC_PRIO_FILLER
};
171
172
173
174
175
176
/*
 * The route cache proper: RT_HASH_DIVISOR buckets, each the head of a singly
 * linked chain of struct rtable threaded through u.rt_next.  Buckets are
 * selected with rt_hash_code().
 */
struct rtable *rt_hash_table[RT_HASH_DIVISOR];

/* Insert (or find an equivalent of) a cache entry; defined further down. */
static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
180
/*
 * Map a (destination, source, TOS) triple onto a route cache bucket.
 *
 * The destination has the two nibbles of every byte swapped, is XORed with
 * the source and the TOS, and the 32-bit result is then folded down onto its
 * low byte.  Callers mix the interface index into @saddr before calling.
 *
 * Returns a value in the range 0..255.
 */
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	unsigned upper_nibbles = (daddr & 0xF0F0F0F0) >> 4;
	unsigned lower_nibbles = (daddr & 0x0F0F0F0F) << 4;
	unsigned h = upper_nibbles | lower_nibbles;

	h ^= saddr;
	h ^= tos;

	/* Fold 32 bits -> 16 bits -> 8 bits. */
	h ^= h >> 16;
	h ^= h >> 8;

	return h & 0xFF;
}
188
189#ifdef CONFIG_PROC_FS
190
/*
 * Render the route cache as text for a get_info style reader (compiled only
 * under CONFIG_PROC_FS).
 *
 * The listing is treated as a virtual file made of fixed 128-byte records:
 * one header record followed by one record per cache entry.  Only the records
 * overlapping [offset, offset+length) are written to @buffer.
 *
 * @buffer: output area supplied by the caller
 * @start:  set to the position inside @buffer where the requested data begins
 * @offset: byte offset into the virtual file
 * @length: number of bytes requested
 * @dummy:  unused
 *
 * Returns the number of valid bytes at *start, never more than @length.
 */
static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
{
	int len=0;
	off_t pos=0;
	char temp[129];
	struct rtable *r;
	int i;

	/* The header record always occupies virtual bytes 0..127. */
	pos = 128;

	if (offset<128) {
		sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
		len = 128;
	}


	/* The chain walk is bracketed by start_bh_atomic()/end_bh_atomic(). */
	start_bh_atomic();

	for (i = 0; i<RT_HASH_DIVISOR; i++) {
		for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
			/* pos is the virtual offset just past this record. */
			pos += 128;

			/* Record lies entirely before the requested window:
			 * skip it and restart the output at the buffer head.
			 */
			if (pos <= offset) {
				len = 0;
				continue;
			}
			/* The "Metric" column is always printed as 0. */
			sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
				r->u.dst.dev ? r->u.dst.dev->name : "*",
				(unsigned long)r->rt_dst,
				(unsigned long)r->rt_gateway,
				r->rt_flags,
				atomic_read(&r->u.dst.use),
				atomic_read(&r->u.dst.refcnt),
				0,
				(unsigned long)r->rt_src, (int)r->u.dst.pmtu,
				r->u.dst.window,
				(int)r->u.dst.rtt, r->key.tos,
				r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
				r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
				r->rt_spec_dst);
			/* Pad to a full 128-byte record (127 chars + newline). */
			sprintf(buffer+len,"%-127s\n",temp);
			len += 128;
			/* Stop once the requested window is covered. */
			if (pos >= offset+length)
				goto done;
		}
	}

done:
	end_bh_atomic();

	/* pos-offset bytes of the produced text belong to the caller's window;
	 * they are the tail of what was written.
	 */
	*start = buffer+len-(pos-offset);
	len = pos-offset;
	if (len>length)
		len = length;
	return len;
}
250#endif
251
252static __inline__ void rt_free(struct rtable *rt)
253{
254 dst_free(&rt->u.dst);
255}
256
257static __inline__ void rt_drop(struct rtable *rt)
258{
259 ip_rt_put(rt);
260 dst_free(&rt->u.dst);
261}
262
263static __inline__ int rt_fast_clean(struct rtable *rth)
264{
265
266
267 return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
268 && rth->key.iif && rth->u.rt_next);
269}
270
271static __inline__ int rt_valuable(struct rtable *rth)
272{
273 return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
274 || rth->u.dst.expires);
275}
276
277static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
278{
279 int age;
280
281 if (atomic_read(&rth->u.dst.use))
282 return 0;
283
284 age = jiffies - rth->u.dst.lastuse;
285 if (age <= tmo1 && !rt_fast_clean(rth))
286 return 0;
287 if (age <= tmo2 && rt_valuable(rth))
288 return 0;
289 return 1;
290}
291
292static void rt_check_expire(unsigned long dummy)
293{
294 int i;
295 static int rover;
296 struct rtable *rth, **rthp;
297 unsigned long now = jiffies;
298
299 for (i=0; i<RT_HASH_DIVISOR/5; i++) {
300 unsigned tmo = ip_rt_gc_timeout;
301
302 rover = (rover + 1) & (RT_HASH_DIVISOR-1);
303 rthp = &rt_hash_table[rover];
304
305 while ((rth = *rthp) != NULL) {
306 if (rth->u.dst.expires) {
307
308 if ((long)(now - rth->u.dst.expires) < tmo) {
309 tmo >>= 1;
310 rthp = &rth->u.rt_next;
311 continue;
312 }
313 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
314 tmo >>= 1;
315 rthp = &rth->u.rt_next;
316 continue;
317 }
318
319
320
321
322 *rthp = rth->u.rt_next;
323 rt_free(rth);
324 }
325
326
327 if ((jiffies - now) > 0)
328 break;
329 }
330 rt_periodic_timer.expires = now + ip_rt_gc_interval;
331 add_timer(&rt_periodic_timer);
332}
333
334static void rt_run_flush(unsigned long dummy)
335{
336 int i;
337 struct rtable * rth, * next;
338
339 rt_deadline = 0;
340
341 start_bh_atomic();
342 for (i=0; i<RT_HASH_DIVISOR; i++) {
343 if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL)
344 continue;
345 end_bh_atomic();
346
347 for (; rth; rth=next) {
348 next = rth->u.rt_next;
349 rth->u.rt_next = NULL;
350 rt_free(rth);
351 }
352
353 start_bh_atomic();
354 }
355 end_bh_atomic();
356}
357
/*
 * Request a flush of the route cache.
 *
 * @delay: jiffies to wait before flushing; a negative value selects the
 *         default ip_rt_min_delay, zero flushes immediately.
 *
 * Repeated requests are coalesced through rt_flush_timer.  The first delayed
 * request also arms rt_deadline (now + ip_rt_max_delay); later requests can
 * never push the flush beyond that deadline.
 */
void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_interrupt();

	if (delay < 0)
		delay = ip_rt_min_delay;

	start_bh_atomic();

	/* del_timer() is evaluated first and unconditionally, so any pending
	 * flush timer is always cancelled here.  The clamp below only applies
	 * when a timer really was pending and a deadline is armed.
	 */
	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* When not called from interrupt context and at least
		 * ip_rt_min_delay has passed since the deadline was armed,
		 * flush right away instead of waiting any longer.
		 */
		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		/* Never wait past the deadline. */
		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		end_bh_atomic();
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	rt_flush_timer.expires = now + delay;
	add_timer(&rt_flush_timer);
	end_bh_atomic();
}
398
399
400
401
402
403
404
405
406
407
408
409
410
411
/*
 * Route cache garbage collector.  Listed in the ipv4_dst_ops initializer and
 * also called directly by rt_intern_hash() when a neighbour entry cannot be
 * bound.
 *
 * State kept between calls:
 *   expire      - current age horizon handed to rt_may_expire(); halved
 *                 whenever a pass misses its goal, grown again at work_done
 *   last_gc     - time of the last pass counted for rate limiting
 *   rover       - bucket where the previous pass stopped
 *   equilibrium - entry count the collector currently steers towards
 *
 * Returns 0 normally, 1 if the cache still holds ip_rt_max_size or more
 * entries after collecting (a rate-limited "dst cache overflow" message is
 * printed in that case).
 */
static int rt_garbage_collect(void)
{
	static unsigned expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/* Rate limit: skip the pass if one ran recently, unless the cache has
	 * reached its hard size limit.
	 */
	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		return 0;

	/* goal = number of entries to remove in this pass. */
	goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
	if (goal <= 0) {
		/* Average chain length is within the elasticity limit: only
		 * trim what exceeds the (slowly rising) equilibrium.
		 */
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min(goal/2, RT_HASH_DIVISOR);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* Chains are too long: remove at least RT_HASH_DIVISOR entries
		 * and reset the equilibrium to the resulting size.
		 */
		goal = max(goal/2, RT_HASH_DIVISOR);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	/* Nothing to remove; goal is <= 0 so this lowers (or keeps) the
	 * equilibrium.
	 */
	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		start_bh_atomic();
		for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
			/* Tolerated age, halved for each entry kept in this chain. */
			unsigned tmo = expire;

			k = (k + 1) & (RT_HASH_DIVISOR-1);
			rthp = &rt_hash_table[k];
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
				*rthp = rth->u.rt_next;
				rth->u.rt_next = NULL;
				rt_free(rth);
				goal--;
			}
			/* The current chain is always finished before stopping. */
			if (goal <= 0)
				break;
		}
		rover = k;
		end_bh_atomic();

		if (goal <= 0)
			goto work_done;

		/* Goal missed after a full sweep.  With the horizon already
		 * at zero there is nothing left to tighten.
		 */
		if (expire == 0)
			break;

		/* Tighten the horizon for the next sweep (or the next call). */
		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			return 0;
	/* Retry only outside interrupt context and within the same tick. */
	} while (!in_interrupt() && jiffies - now < 1);

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		return 0;
	if (net_ratelimit())
		printk("dst cache overflow\n");
	return 1;

work_done:
	/* Goal reached: relax the horizon again, capped at ip_rt_gc_timeout
	 * and reset to it outright once the cache is below gc_thresh.
	 */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
	return 0;
}
522
/*
 * Insert @rt into bucket @hash of the route cache, or reuse an existing
 * entry with an identical key.
 *
 * @hash: bucket index, as computed by rt_hash_code()
 * @rt:   freshly built entry; ownership passes to this function
 * @rp:   receives the entry the caller should use
 *
 * If an entry with the same key is already cached it is moved to the head of
 * its chain, its refcnt and use counters are incremented, @rt is dropped and
 * the existing entry is returned through @rp.
 *
 * Returns 0 on success, -ENOBUFS if no neighbour entry could be bound (in
 * which case @rt has been dropped and *rp is left untouched).
 */
static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
{
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	/* One garbage-collect-and-retry attempt, but only outside interrupt
	 * context.
	 */
	int attempts = !in_interrupt();

restart:
	start_bh_atomic();

	rthp = &rt_hash_table[hash];

	while ((rth = *rthp) != NULL) {
		if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
			/* Hit: move the existing entry to the chain head. */
			*rthp = rth->u.rt_next;
			rth->u.rt_next = rt_hash_table[hash];
			rt_hash_table[hash] = rth;

			atomic_inc(&rth->u.dst.refcnt);
			atomic_inc(&rth->u.dst.use);
			rth->u.dst.lastuse = now;
			end_bh_atomic();

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		rthp = &rth->u.rt_next;
	}

	/* Unicast routes and all output routes (key.iif == 0) get a
	 * neighbour entry bound before they become visible in the cache.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
		if (!arp_bind_neighbour(&rt->u.dst)) {
			end_bh_atomic();

			/* Binding failed.  Run the collector with the
			 * elasticity temporarily forced to 1 and try once
			 * more.  NOTE(review): presumably this works because
			 * freeing routes releases the neighbour entries they
			 * hold - the neighbour code is not visible here.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				ip_rt_gc_elasticity = 1;
				rt_garbage_collect();
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			rt_drop(rt);
			if (net_ratelimit())
				printk("neighbour table overflow\n");
			return -ENOBUFS;
		}
	}

	/* Miss: link the new entry at the head of the chain. */
	rt->u.rt_next = rt_hash_table[hash];
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable * trt;
		printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
		for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next)
			printk(" . %08x", trt->rt_dst);
		printk("\n");
	}
#endif
	rt_hash_table[hash] = rt;
	end_bh_atomic();
	*rp = rt;
	return 0;
}
595
/*
 * Process a redirect: traffic from @saddr to @daddr that currently goes
 * through @old_gw should use @new_gw instead.
 *
 * @old_gw: gateway named as the sender of the redirect
 * @daddr:  destination the redirect applies to
 * @new_gw: gateway to use from now on
 * @saddr:  local source address of the affected traffic
 * @tos:    type of service; masked with IPTOS_TOS_MASK
 * @dev:    device the redirect arrived on
 *
 * After validation, every matching output route in the cache (tried with and
 * without the source address and the output interface in the key) is
 * replaced by a copy that points at @new_gw and is flagged RTCF_REDIRECTED.
 * Rejected redirects are optionally logged under CONFIG_IP_ROUTE_VERBOSE.
 */
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
	u32 saddr, u8 tos, struct device *dev)
{
	int i, k;
	struct in_device *in_dev = dev->ip_ptr;
	struct rtable *rth, **rthp;
	/* Key variants to probe: specific and wildcard source / interface. */
	u32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_TOS_MASK;

	if (!in_dev)
		return;

	/* Basic sanity: redirects must be enabled on the device and the new
	 * gateway must be a different, ordinary unicast address.
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* Not shared media: inet_addr_onlink() must accept the
		 * new/old gateway pair, and with secure redirects enabled the
		 * redirect is rejected when ip_fib_check_default() returns
		 * non-zero.
		 */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i=0; i<2; i++) {
		for (k=0; k<2; k++) {
			unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);

			rthp=&rt_hash_table[hash];

			while ( (rth = *rthp) != NULL) {
				struct rtable *rt;

				/* Look for the output route with exactly this key. */
				if (rth->key.dst != daddr ||
				    rth->key.src != skeys[i] ||
				    rth->key.tos != tos ||
				    rth->key.oif != ikeys[k] ||
				    rth->key.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				/* Key matches, but the route itself must still
				 * go through old_gw on this device; otherwise
				 * give up on this bucket.
				 */
				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/* Hold the old entry while the copy is built. */
				dst_clone(&rth->u.dst);

				rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					return;
				}

				/* Start from a bitwise copy, then reset the
				 * per-entry bookkeeping and cached link-layer
				 * state.
				 */
				*rt = *rth;
				atomic_set(&rt->u.dst.refcnt, 1);
				atomic_set(&rt->u.dst.use, 1);
				rt->u.dst.lastuse = jiffies;
				rt->u.dst.neighbour = NULL;
				rt->u.dst.hh = NULL;

				rt->rt_flags |= RTCF_REDIRECTED;

				/* The actual redirect. */
				rt->rt_gateway = new_gw;

				/* Confirm the old route's dst entry. */
				dst_confirm(&rth->u.dst);

				/* Only switch over if the new gateway already
				 * has a valid neighbour entry; otherwise
				 * trigger resolution (when an entry exists)
				 * and keep the old route for now.
				 */
				if (!arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					break;
				}

				/* Unlink the old entry, insert the new one and
				 * release the references taken above.
				 */
				*rthp = rth->u.rt_next;
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				rt_drop(rth);
				break;
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %lX/%s to %lX ignored."
			"Path = %lX -> %lX, tos %02x\n",
			ntohl(old_gw), dev->name, ntohl(new_gw),
			ntohl(saddr), ntohl(daddr), tos);
#endif
}
703
704static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
705{
706 struct rtable *rt = (struct rtable*)dst;
707
708 if (rt != NULL) {
709 if (dst->obsolete) {
710 ip_rt_put(rt);
711 return NULL;
712 }
713 if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
714 unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
715 struct rtable **rthp;
716#if RT_CACHE_DEBUG >= 1
717 printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
718#endif
719 start_bh_atomic();
720 ip_rt_put(rt);
721 for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
722 if (*rthp == rt) {
723 *rthp = rt->u.rt_next;
724 rt_free(rt);
725 break;
726 }
727 }
728 end_bh_atomic();
729 return NULL;
730 }
731 }
732 return dst;
733}
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
/*
 * Send an ICMP host redirect for @skb, subject to per-route rate limiting.
 *
 * The route attached to the packet (skb->dst) carries the limiter state:
 * rate_tokens counts the redirects sent so far and rate_last is the time of
 * the last one (or of the last packet seen while the sender was being
 * ignored).  Successive redirects are spaced with exponential back-off,
 * ip_rt_redirect_load << rate_tokens, and after ip_rt_redirect_number
 * redirects nothing more is sent until the host has been quiet for
 * ip_rt_redirect_silence.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;

	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
		return;

	/* Quiet for long enough: forget the history and start over. */
	if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
		rt->u.dst.rate_tokens = 0;

	/* Too many redirects already sent: stay silent, but remember that
	 * the host is still sending through us.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		return;
	}

	/* Send only if the back-off interval for this attempt has elapsed. */
	if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
			printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
				rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
#endif
	}
}
788
789static int ip_error(struct sk_buff *skb)
790{
791 struct rtable *rt = (struct rtable*)skb->dst;
792 unsigned long now;
793 int code;
794
795 switch (rt->u.dst.error) {
796 case EINVAL:
797 default:
798 kfree_skb(skb);
799 return 0;
800 case EHOSTUNREACH:
801 code = ICMP_HOST_UNREACH;
802 break;
803 case ENETUNREACH:
804 code = ICMP_NET_UNREACH;
805 break;
806 case EACCES:
807 code = ICMP_PKT_FILTERED;
808 break;
809 }
810
811 now = jiffies;
812 if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
813 rt->u.dst.rate_tokens = ip_rt_error_burst;
814 rt->u.dst.rate_last = now;
815 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
816 rt->u.dst.rate_tokens -= ip_rt_error_cost;
817 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
818 }
819
820 kfree_skb(skb);
821 return 0;
822}
823
824
825
826
827
828
/* Candidate path MTU values, in strictly descending order. */
static unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Pick the next lower plateau for a datagram of size @old_mtu that did not
 * fit: the largest table entry strictly below @old_mtu, or 68 when no table
 * entry is smaller.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end =
		mtu_plateau + sizeof(mtu_plateau)/sizeof(mtu_plateau[0]);

	while (plateau < end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
841
/*
 * Handle a "fragmentation needed" report for the datagram whose IP header is
 * @iph, lowering the path MTU of the matching output routes.
 *
 * @iph:     IP header of the datagram that was too big
 * @new_mtu: MTU reported for the next hop; values below 68 or not smaller
 *           than the datagram length are distrusted and replaced by a guess
 *
 * Returns the MTU now assumed for the path, or 0 if path MTU discovery is
 * disabled (ipv4_config.no_pmtu_disc) or no cached route was affected.
 */
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	/* Probe the cache with the specific source and with a wildcard source. */
	u32 skeys[2] = { iph->saddr, 0, };
	u32 daddr = iph->daddr;
	u8 tos = iph->tos & IPTOS_TOS_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i=0; i<2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
			/* Output routes only (key.iif == 0), and only those
			 * whose MTU metric is not locked.
			 */
			if (rth->key.dst == daddr &&
			    rth->key.src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->key.tos == tos &&
			    rth->key.iif == 0 &&
			    !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {
					/* No usable MTU was reported.  When
					 * none was given at all, take the IP
					 * header length off the datagram size
					 * first, then fall back to the
					 * plateau table.
					 */
					if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
					    old_mtu >= 68 + (iph->ihl<<2))
						old_mtu -= iph->ihl<<2;

					mtu = guess_mtu(old_mtu);
				}
				/* The path MTU is only ever lowered here; a
				 * lowered value expires after
				 * ip_rt_mtu_expires.
				 */
				if (mtu <= rth->u.dst.pmtu) {
					if (mtu < rth->u.dst.pmtu) {
						dst_confirm(&rth->u.dst);
						rth->u.dst.pmtu = mtu;
						dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
	}
	return est_mtu;
}
890
891static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie)
892{
893 dst_release(dst);
894 return NULL;
895}
896
897static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
898 struct sk_buff *skb)
899{
900 return NULL;
901}
902
903static void ipv4_link_failure(struct sk_buff *skb)
904{
905 struct rtable *rt;
906
907 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
908
909 rt = (struct rtable *) skb->dst;
910 if (rt)
911 dst_set_expires(&rt->u.dst, 0);
912}
913
914static int ip_rt_bug(struct sk_buff *skb)
915{
916 printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
917 skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
918 kfree_skb(skb);
919 return 0;
920}
921
922
923
924
925
926
927
928
929
930
931void ip_rt_get_source(u8 *addr, struct rtable *rt)
932{
933 u32 src;
934 struct fib_result res;
935
936 if (rt->key.iif == 0)
937 src = rt->rt_src;
938 else if (fib_lookup(&rt->key, &res) == 0)
939 src = FIB_RES_PREFSRC(res);
940 else
941 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
942 memcpy(addr, &src, 4);
943}
944
945#ifdef CONFIG_NET_CLS_ROUTE
946static void set_class_tag(struct rtable *rt, u32 tag)
947{
948 if (!(rt->u.dst.tclassid&0xFFFF))
949 rt->u.dst.tclassid |= tag&0xFFFF;
950 if (!(rt->u.dst.tclassid&0xFFFF0000))
951 rt->u.dst.tclassid |= tag&0xFFFF0000;
952}
953#endif
954
/*
 * Fill in the next-hop dependent fields of a new cache entry from a FIB
 * lookup result.
 *
 * @rt:   entry under construction; rt->u.dst.dev, rt_gateway and rt_dst must
 *        already be set
 * @res:  FIB lookup result; res->fi may be NULL
 * @itag: classifier tag obtained during source validation (only used with
 *        CONFIG_NET_CLS_ROUTE)
 *
 * Sets gateway, metric lock mask, path MTU, window, RTT, optional classifier
 * id and finally rt_type.
 */
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		/* Take the FIB gateway only when the next hop has link scope. */
		if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
		rt->u.dst.pmtu = fi->fib_mtu;
		if (fi->fib_mtu == 0) {
			/* No MTU metric configured: use the device MTU,
			 * capped at IP_MAX_MTU.
			 */
			rt->u.dst.pmtu = rt->u.dst.dev->mtu;
			if (rt->u.dst.pmtu > IP_MAX_MTU)
				rt->u.dst.pmtu = IP_MAX_MTU;
			/* With the MTU metric locked and the destination
			 * reached through a gateway, stay at or below 576.
			 */
			if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.pmtu > 576)
				rt->u.dst.pmtu = 576;
		}
		rt->u.dst.window= fi->fib_window ? : 0;
		rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else {
		/* No FIB info: device MTU (capped) and default metrics. */
		rt->u.dst.pmtu = rt->u.dst.dev->mtu;
		if (rt->u.dst.pmtu > IP_MAX_MTU)
			rt->u.dst.pmtu = IP_MAX_MTU;
		rt->u.dst.window= 0;
		rt->u.dst.rtt = TCP_TIMEOUT_INIT;
	}
#ifdef CONFIG_NET_CLS_ROUTE
	/* Fill still-empty halves of the class id: rule tag first, then the
	 * tag from source validation.
	 */
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
993
/*
 * Build and cache an input route for a multicast packet.
 *
 * @skb:   received packet; skb->dst receives the resulting route
 * @daddr: multicast destination
 * @saddr: source address
 * @tos:   type of service (already masked by the caller)
 * @dev:   receiving device
 * @our:   non-zero if this host is a member of the group on @dev
 *
 * Returns 0 on success, -EINVAL for an unacceptable source or a non-IP
 * frame, -ENOBUFS if no entry could be allocated, or the error from
 * rt_intern_hash().
 */
static int
ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
	u8 tos, struct device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = dev->ip_ptr;
	u32 itag = 0;

	/* Reject sources that can never be legitimate, devices without IP
	 * configuration, and frames that are not ETH_P_IP.
	 */
	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP))
		return -EINVAL;

	if (ZERONET(saddr)) {
		/* A zero-net source is tolerated only for link-local groups. */
		if (!LOCAL_MCAST(daddr))
			return -EINVAL;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
		return -EINVAL;

	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	/* Input-only entry: sending through it is a bug. */
	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst = daddr;
	rth->rt_dst = daddr;
	rth->key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark = skb->fwmark;
#endif
	rth->key.src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map = daddr;
	rth->rt_src_map = saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->key.iif = dev->ifindex;
	rth->u.dst.dev = &loopback_dev;
	rth->key.oif = 0;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_flags = RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* With multicast forwarding enabled, non-link-local groups go
	 * through ip_mr_input (this overrides the local-delivery handler).
	 */
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif

	hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
/*
 * Slow path of input routing: called on a cache miss for a non-multicast
 * destination.  Classifies the packet, consults the FIB, builds a cache
 * entry and installs it with rt_intern_hash().
 *
 * @skb:   received packet; skb->dst receives the resulting route
 * @daddr: destination address
 * @saddr: source address
 * @tos:   type of service (already masked by the caller)
 * @dev:   receiving device
 *
 * Outcomes:
 *   - forwarded unicast: entry with input = ip_forward, output = ip_output
 *   - local / broadcast: entry on loopback_dev with input = ip_local_deliver
 *   - no route (forwarding enabled): entry with input = ip_error
 *   - martian source/destination: optionally logged, -EINVAL
 *
 * Returns 0 on success, -EINVAL or -ENOBUFS on failure.
 */
int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
	u8 tos, struct device *dev)
{
	struct rt_key key;
	struct fib_result res;
	struct in_device *in_dev = dev->ip_ptr;
	struct in_device *out_dev;
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable * rth;
	unsigned hash;
	u32 spec_dst;
	int err = -EINVAL;

	/* IP is not enabled on this device. */
	if (!in_dev)
		return -EINVAL;

	key.dst = daddr;
	key.src = saddr;
	key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	key.fwmark = skb->fwmark;
#endif
	key.iif = dev->ifindex;
	key.oif = 0;
	key.scope = RT_SCOPE_UNIVERSE;

	/* Same hash as the fast path in ip_route_input(). */
	hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);

	/* Sources that can never be valid. */
	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	/* Limited broadcast, or the all-zero source/destination pair. */
	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* A zero-net source is accepted only in the case handled above. */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/* Look the destination up.  On failure, hosts that do not forward
	 * simply reject; routers cache an "unreachable" entry.
	 */
	if ((err = fib_lookup(&key, &res))) {
		if (!IN_DEV_FORWARD(in_dev))
			return -EINVAL;
		goto no_route;
	}

#ifdef CONFIG_IP_ROUTE_NAT
	/* Policy routing / NAT: possibly remap the source, and for RTN_NAT
	 * results remap the destination and look it up again.
	 */
	if (1) {
		u32 src_map = saddr;
		if (res.r)
			src_map = fib_rules_policy(saddr, &res, &flags);

		if (res.type == RTN_NAT) {
			key.dst = fib_rules_map_destination(daddr, &res);
			if (fib_lookup(&key, &res) || res.type != RTN_UNICAST)
				return -EINVAL;
			flags |= RTCF_DNAT;
		}
		key.src = src_map;
	}
#endif

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		/* Validate the source; a negative result marks a martian,
		 * a positive one a directly connected source.
		 */
		result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
			dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	/* From here on the packet would have to be forwarded. */
	if (!IN_DEV_FORWARD(in_dev))
		return -EINVAL;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
#endif
	out_dev = FIB_RES_DEV(res)->ip_ptr;
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
	if (err < 0)
		goto martian_source;

	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would leave through the interface it came in on, from a
	 * directly connected source, without NAT/masquerading: mark the
	 * route so that a redirect is sent.
	 */
	if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev)
	     || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != __constant_htons(ETH_P_IP)) {
		/* Not an IP frame: refuse to send it back out of the
		 * receiving interface unless the destination was NATed.
		 */
		if (out_dev == in_dev && !(flags&RTCF_DNAT))
			return -EINVAL;
	}

	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst = daddr;
	rth->rt_dst = daddr;
	rth->key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark = skb->fwmark;
#endif
	rth->key.src = saddr;
	rth->rt_src = saddr;
	rth->rt_gateway = daddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_src_map = key.src;
	rth->rt_dst_map = key.dst;
	if (flags&RTCF_DNAT)
		rth->rt_gateway = key.dst;
#endif
	rth->rt_iif =
	rth->key.iif = dev->ifindex;
	rth->u.dst.dev = out_dev->dev;
	rth->key.oif = 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, &res, itag);

	rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
	/* Offer the route to the input device's fast path when it differs
	 * from the output device and the output MTU is at least as large.
	 */
	if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
		struct device *odev = rth->u.dst.dev;
		if (odev != dev &&
		    dev->accept_fastpath &&
		    odev->mtu >= dev->mtu &&
		    dev->accept_fastpath(dev, &rth->u.dst) == 0)
			rth->rt_flags |= RTCF_FAST;
	}
#endif

	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);

brd_input:
	/* Broadcasts are only accepted from ETH_P_IP frames. */
	if (skb->protocol != __constant_htons(ETH_P_IP))
		return -EINVAL;

	if (ZERONET(saddr)) {
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;

local_input:
	/* Shared tail for local, broadcast and unreachable entries: all are
	 * attached to loopback_dev and must never be used for output.
	 */
	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst = daddr;
	rth->rt_dst = daddr;
	rth->key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark = skb->fwmark;
#endif
	rth->key.src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map = key.dst;
	rth->rt_src_map = key.src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->key.iif = dev->ifindex;
	rth->u.dst.dev = &loopback_dev;
	rth->key.oif = 0;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags = flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		/* Reached via no_route: err still holds the fib_lookup()
		 * error, stored here with its sign flipped for ip_error().
		 */
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	rth->rt_type = res.type;
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);

no_route:
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/* Invalid addresses: log if enabled, never cache. */
martian_destination:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
#endif
	return -EINVAL;

martian_source:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/* Besides the addresses, dump the link-layer header to help
		 * identify the sender.
		 */
		printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header:");
			for (i=0; i<dev->hard_header_len; i++, p++)
				printk(" %02x", *p);
			printk("\n");
		}
	}
#endif
	return -EINVAL;
}
1339
/*
 * Attach an input route to a received packet.
 *
 * @skb:   received packet; on success skb->dst points at the route
 * @daddr: destination address
 * @saddr: source address
 * @tos:   type of service; masked with IPTOS_TOS_MASK here
 * @dev:   receiving device
 *
 * Fast path: look the (daddr, saddr, iif, tos[, fwmark]) key up in the route
 * cache.  On a miss, multicast destinations go to ip_route_input_mc() and
 * everything else to ip_route_input_slow().
 *
 * Returns 0 on success or a negative errno.
 */
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
	u8 tos, struct device *dev)
{
	struct rtable * rth;
	unsigned hash;
	int iif = dev->ifindex;

	tos &= IPTOS_TOS_MASK;
	hash = rt_hash_code(daddr, saddr^(iif<<5), tos);

	for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
		if (rth->key.dst == daddr &&
		    rth->key.src == saddr &&
		    rth->key.iif == iif &&
		    rth->key.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->key.fwmark == skb->fwmark &&
#endif
		    rth->key.tos == tos) {
			/* Cache hit: refresh the usage data and take a
			 * reference for the packet.
			 */
			rth->u.dst.lastuse = jiffies;
			atomic_inc(&rth->u.dst.use);
			atomic_inc(&rth->u.dst.refcnt);
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
	}

	/* Multicast is recognised by address before any FIB lookup.  The
	 * packet is accepted if this host is a member of the group or, with
	 * CONFIG_IP_MROUTE, if the group is not link-local and multicast
	 * forwarding is enabled on the device.
	 */
	if (MULTICAST(daddr)) {
		int our = ip_check_mc(dev, daddr);
		if (!our
#ifdef CONFIG_IP_MROUTE
		    && (LOCAL_MCAST(daddr) || !dev->ip_ptr ||
			!IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr))
#endif
		    ) return -EINVAL;
		return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
1390
1391
1392
1393
1394
1395int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
1396{
1397 struct rt_key key;
1398 struct fib_result res;
1399 unsigned flags = 0;
1400 struct rtable *rth;
1401 struct device *dev_out = NULL;
1402 unsigned hash;
1403#ifdef CONFIG_IP_TRANSPARENT_PROXY
1404 u32 nochecksrc = (tos & RTO_TPROXY);
1405#endif
1406
1407 tos &= IPTOS_TOS_MASK|RTO_ONLINK;
1408 key.dst = daddr;
1409 key.src = saddr;
1410 key.tos = tos&IPTOS_TOS_MASK;
1411 key.iif = loopback_dev.ifindex;
1412 key.oif = oif;
1413 key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1414 res.fi = NULL;
1415#ifdef CONFIG_IP_MULTIPLE_TABLES
1416 res.r = NULL;
1417#endif
1418
1419 if (saddr) {
1420 if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
1421 return -EINVAL;
1422
1423
1424 dev_out = ip_dev_find(saddr);
1425#ifdef CONFIG_IP_TRANSPARENT_PROXY
1426
1427
1428
1429 if (dev_out == NULL) {
1430 if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST)
1431 return -EINVAL;
1432 flags |= RTCF_TPROXY;
1433 }
1434#else
1435 if (dev_out == NULL)
1436 return -EINVAL;
1437#endif
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447 if (oif == 0 &&
1448#ifdef CONFIG_IP_TRANSPARENT_PROXY
1449 dev_out &&
1450#endif
1451 (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467 key.oif = dev_out->ifindex;
1468 goto make_route;
1469 }
1470 dev_out = NULL;
1471 }
1472 if (oif) {
1473 dev_out = dev_get_by_index(oif);
1474 if (dev_out == NULL)
1475 return -ENODEV;
1476 if (dev_out->ip_ptr == NULL)
1477 return -ENODEV;
1478
1479 if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
1480 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1481 goto make_route;
1482 }
1483 if (MULTICAST(daddr))
1484 key.src = inet_select_addr(dev_out, 0, key.scope);
1485 else if (!daddr)
1486 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
1487 }
1488
1489 if (!key.dst) {
1490 key.dst = key.src;
1491 if (!key.dst)
1492 key.dst = key.src = htonl(INADDR_LOOPBACK);
1493 dev_out = &loopback_dev;
1494 key.oif = loopback_dev.ifindex;
1495 res.type = RTN_LOCAL;
1496 flags |= RTCF_LOCAL;
1497 goto make_route;
1498 }
1499
1500 if (fib_lookup(&key, &res)) {
1501 res.fi = NULL;
1502 if (oif) {
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521 if (key.src == 0)
1522 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1523 res.type = RTN_UNICAST;
1524 goto make_route;
1525 }
1526 return -ENETUNREACH;
1527 }
1528
1529 if (res.type == RTN_NAT)
1530 return -EINVAL;
1531
1532 if (res.type == RTN_LOCAL) {
1533 if (!key.src)
1534 key.src = key.dst;
1535 dev_out = &loopback_dev;
1536 key.oif = dev_out->ifindex;
1537 res.fi = NULL;
1538 flags |= RTCF_LOCAL;
1539 goto make_route;
1540 }
1541
1542#ifdef CONFIG_IP_ROUTE_MULTIPATH
1543 if (res.fi->fib_nhs > 1 && key.oif == 0)
1544 fib_select_multipath(&key, &res);
1545 else
1546#endif
1547 if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
1548 fib_select_default(&key, &res);
1549
1550 if (!key.src)
1551 key.src = FIB_RES_PREFSRC(res);
1552
1553 dev_out = FIB_RES_DEV(res);
1554 key.oif = dev_out->ifindex;
1555
1556make_route:
1557 if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1558 return -EINVAL;
1559
1560 if (key.dst == 0xFFFFFFFF)
1561 res.type = RTN_BROADCAST;
1562 else if (MULTICAST(key.dst))
1563 res.type = RTN_MULTICAST;
1564 else if (BADCLASS(key.dst) || ZERONET(key.dst))
1565 return -EINVAL;
1566
1567 if (dev_out->flags&IFF_LOOPBACK)
1568 flags |= RTCF_LOCAL;
1569
1570 if (res.type == RTN_BROADCAST) {
1571 flags |= RTCF_BROADCAST|RTCF_LOCAL;
1572 res.fi = NULL;
1573 } else if (res.type == RTN_MULTICAST) {
1574 flags |= RTCF_MULTICAST|RTCF_LOCAL;
1575 if (!ip_check_mc(dev_out, daddr))
1576 flags &= ~RTCF_LOCAL;
1577
1578
1579
1580
1581 if (res.fi && res.prefixlen < 4)
1582 res.fi = NULL;
1583 }
1584
1585 rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
1586 if (!rth)
1587 return -ENOBUFS;
1588
1589 atomic_set(&rth->u.dst.use, 1);
1590 rth->key.dst = daddr;
1591 rth->key.tos = tos;
1592 rth->key.src = saddr;
1593 rth->key.iif = 0;
1594 rth->key.oif = oif;
1595 rth->rt_dst = key.dst;
1596 rth->rt_src = key.src;
1597#ifdef CONFIG_IP_ROUTE_NAT
1598 rth->rt_dst_map = key.dst;
1599 rth->rt_src_map = key.src;
1600#endif
1601 rth->rt_iif = oif ? : dev_out->ifindex;
1602 rth->u.dst.dev = dev_out;
1603 rth->rt_gateway = key.dst;
1604 rth->rt_spec_dst= key.src;
1605
1606 rth->u.dst.output=ip_output;
1607
1608 if (flags&RTCF_LOCAL) {
1609 rth->u.dst.input = ip_local_deliver;
1610 rth->rt_spec_dst = key.dst;
1611 }
1612 if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
1613 rth->rt_spec_dst = key.src;
1614 if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
1615 rth->u.dst.output = ip_mc_output;
1616#ifdef CONFIG_IP_MROUTE
1617 if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
1618 struct in_device *in_dev = dev_out->ip_ptr;
1619 if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
1620 rth->u.dst.input = ip_mr_input;
1621 rth->u.dst.output = ip_mc_output;
1622 }
1623 }
1624#endif
1625 }
1626
1627 rt_set_nexthop(rth, &res, 0);
1628
1629 rth->rt_flags = flags;
1630
1631 hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
1632 return rt_intern_hash(hash, rth, rp);
1633}
1634
1635int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
1636{
1637 unsigned hash;
1638 struct rtable *rth;
1639
1640 hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
1641
1642 start_bh_atomic();
1643 for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
1644 if (rth->key.dst == daddr &&
1645 rth->key.src == saddr &&
1646 rth->key.iif == 0 &&
1647 rth->key.oif == oif &&
1648#ifndef CONFIG_IP_TRANSPARENT_PROXY
1649 rth->key.tos == tos
1650#else
1651 !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
1652 ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
1653#endif
1654 ) {
1655 rth->u.dst.lastuse = jiffies;
1656 atomic_inc(&rth->u.dst.use);
1657 atomic_inc(&rth->u.dst.refcnt);
1658 end_bh_atomic();
1659 *rp = rth;
1660 return 0;
1661 }
1662 }
1663 end_bh_atomic();
1664
1665 return ip_route_output_slow(rp, daddr, saddr, tos, oif);
1666}
1667
1668#ifdef CONFIG_RTNETLINK
1669
1670static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
1671{
1672 struct rtable *rt = (struct rtable*)skb->dst;
1673 struct rtmsg *r;
1674 struct nlmsghdr *nlh;
1675 unsigned char *b = skb->tail;
1676 struct rta_cacheinfo ci;
1677#ifdef CONFIG_IP_MROUTE
1678 struct rtattr *eptr;
1679#endif
1680 struct rtattr *mx;
1681
1682 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
1683 r = NLMSG_DATA(nlh);
1684 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1685 r->rtm_family = AF_INET;
1686 r->rtm_dst_len = 32;
1687 r->rtm_src_len = 0;
1688 r->rtm_tos = rt->key.tos;
1689 r->rtm_table = RT_TABLE_MAIN;
1690 r->rtm_type = rt->rt_type;
1691 r->rtm_scope = RT_SCOPE_UNIVERSE;
1692 r->rtm_protocol = RTPROT_UNSPEC;
1693 r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
1694 if (rt->rt_flags & RTCF_NOTIFY)
1695 r->rtm_flags |= RTM_F_NOTIFY;
1696 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
1697 if (rt->key.src) {
1698 r->rtm_src_len = 32;
1699 RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
1700 }
1701 if (rt->u.dst.dev)
1702 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
1703#ifdef CONFIG_NET_CLS_ROUTE
1704 if (rt->u.dst.tclassid)
1705 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
1706#endif
1707 if (rt->key.iif)
1708 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
1709 else if (rt->rt_src != rt->key.src)
1710 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
1711 if (rt->rt_dst != rt->rt_gateway)
1712 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
1713 mx = (struct rtattr*)skb->tail;
1714 RTA_PUT(skb, RTA_METRICS, 0, NULL);
1715 if (rt->u.dst.mxlock)
1716 RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
1717 if (rt->u.dst.pmtu)
1718 RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
1719 if (rt->u.dst.window)
1720 RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
1721 if (rt->u.dst.rtt)
1722 RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
1723 mx->rta_len = skb->tail - (u8*)mx;
1724 if (mx->rta_len == RTA_LENGTH(0))
1725 skb_trim(skb, (u8*)mx - skb->data);
1726 ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
1727 ci.rta_used = atomic_read(&rt->u.dst.refcnt);
1728 ci.rta_clntref = atomic_read(&rt->u.dst.use);
1729 if (rt->u.dst.expires)
1730 ci.rta_expires = rt->u.dst.expires - jiffies;
1731 else
1732 ci.rta_expires = 0;
1733 ci.rta_error = rt->u.dst.error;
1734#ifdef CONFIG_IP_MROUTE
1735 eptr = (struct rtattr*)skb->tail;
1736#endif
1737 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1738 if (rt->key.iif) {
1739#ifdef CONFIG_IP_MROUTE
1740 u32 dst = rt->rt_dst;
1741
1742 if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
1743 int err = ipmr_get_route(skb, r, nowait);
1744 if (err <= 0) {
1745 if (!nowait) {
1746 if (err == 0)
1747 return 0;
1748 goto nlmsg_failure;
1749 } else {
1750 if (err == -EMSGSIZE)
1751 goto nlmsg_failure;
1752 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
1753 }
1754 }
1755 } else
1756#endif
1757 {
1758 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
1759 }
1760 }
1761
1762 nlh->nlmsg_len = skb->tail - b;
1763 return skb->len;
1764
1765nlmsg_failure:
1766rtattr_failure:
1767 skb_trim(skb, b - skb->data);
1768 return -1;
1769}
1770
1771int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1772{
1773 struct rtattr **rta = arg;
1774 struct rtmsg *rtm = NLMSG_DATA(nlh);
1775 struct rtable *rt = NULL;
1776 u32 dst = 0;
1777 u32 src = 0;
1778 int iif = 0;
1779 int err;
1780 struct sk_buff *skb;
1781
1782 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1783 if (skb == NULL)
1784 return -ENOBUFS;
1785
1786
1787
1788
1789 skb->mac.raw = skb->data;
1790 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
1791
1792 if (rta[RTA_SRC-1])
1793 memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
1794 if (rta[RTA_DST-1])
1795 memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
1796 if (rta[RTA_IIF-1])
1797 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1798
1799 if (iif) {
1800 struct device *dev;
1801 dev = dev_get_by_index(iif);
1802 if (!dev)
1803 return -ENODEV;
1804 skb->protocol = __constant_htons(ETH_P_IP);
1805 skb->dev = dev;
1806 start_bh_atomic();
1807 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
1808 end_bh_atomic();
1809 rt = (struct rtable*)skb->dst;
1810 if (!err && rt->u.dst.error)
1811 err = -rt->u.dst.error;
1812 } else {
1813 int oif = 0;
1814 if (rta[RTA_OIF-1])
1815 memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1816 err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
1817 }
1818 if (err) {
1819 kfree_skb(skb);
1820 return err;
1821 }
1822
1823 skb->dst = &rt->u.dst;
1824 if (rtm->rtm_flags & RTM_F_NOTIFY)
1825 rt->rt_flags |= RTCF_NOTIFY;
1826
1827 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1828
1829 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
1830 if (err == 0)
1831 return 0;
1832 if (err < 0)
1833 return -EMSGSIZE;
1834
1835 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1836 if (err < 0)
1837 return err;
1838 return 0;
1839}
1840
1841
1842int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
1843{
1844 struct rtable *rt;
1845 int h, s_h;
1846 int idx, s_idx;
1847
1848 s_h = cb->args[0];
1849 s_idx = idx = cb->args[1];
1850 for (h=0; h < RT_HASH_DIVISOR; h++) {
1851 if (h < s_h) continue;
1852 if (h > s_h)
1853 memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
1854 start_bh_atomic();
1855 for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
1856 if (idx < s_idx)
1857 continue;
1858 skb->dst = dst_clone(&rt->u.dst);
1859 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
1860 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
1861 dst_release(xchg(&skb->dst, NULL));
1862 end_bh_atomic();
1863 goto done;
1864 }
1865 dst_release(xchg(&skb->dst, NULL));
1866 }
1867 end_bh_atomic();
1868 }
1869
1870done:
1871 cb->args[0] = h;
1872 cb->args[1] = idx;
1873 return skb->len;
1874}
1875
1876#endif
1877
/*
 * Hook for a multicast-related event on an interface.  The in_dev
 * argument is not examined: rt_cache_flush() is simply called with 0.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
1882
1883
1884
1885#ifdef CONFIG_SYSCTL
1886
1887static int flush_delay;
1888
1889static
1890int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
1891 void *buffer, size_t *lenp)
1892{
1893 if (write) {
1894 proc_dointvec(ctl, write, filp, buffer, lenp);
1895 rt_cache_flush(flush_delay);
1896 return 0;
1897 } else
1898 return -EINVAL;
1899}
1900
1901ctl_table ipv4_route_table[] = {
1902 {NET_IPV4_ROUTE_FLUSH, "flush",
1903 &flush_delay, sizeof(int), 0200, NULL,
1904 &ipv4_sysctl_rtcache_flush},
1905 {NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
1906 &ip_rt_min_delay, sizeof(int), 0644, NULL,
1907 &proc_dointvec_jiffies},
1908 {NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
1909 &ip_rt_max_delay, sizeof(int), 0644, NULL,
1910 &proc_dointvec_jiffies},
1911 {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
1912 &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
1913 &proc_dointvec},
1914 {NET_IPV4_ROUTE_MAX_SIZE, "max_size",
1915 &ip_rt_max_size, sizeof(int), 0644, NULL,
1916 &proc_dointvec},
1917 {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
1918 &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
1919 &proc_dointvec_jiffies},
1920 {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
1921 &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
1922 &proc_dointvec_jiffies},
1923 {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
1924 &ip_rt_gc_interval, sizeof(int), 0644, NULL,
1925 &proc_dointvec_jiffies},
1926 {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
1927 &ip_rt_redirect_load, sizeof(int), 0644, NULL,
1928 &proc_dointvec},
1929 {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
1930 &ip_rt_redirect_number, sizeof(int), 0644, NULL,
1931 &proc_dointvec},
1932 {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
1933 &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
1934 &proc_dointvec},
1935 {NET_IPV4_ROUTE_ERROR_COST, "error_cost",
1936 &ip_rt_error_cost, sizeof(int), 0644, NULL,
1937 &proc_dointvec},
1938 {NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
1939 &ip_rt_error_burst, sizeof(int), 0644, NULL,
1940 &proc_dointvec},
1941 {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
1942 &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
1943 &proc_dointvec},
1944 {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
1945 &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
1946 &proc_dointvec_jiffies},
1947 {0}
1948};
1949#endif
1950
1951#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Route accounting table with 256 slots (presumably one per routing
 * realm / traffic class id -- confirm).  Its raw bytes are exported by
 * ip_rt_acct_read().
 */
struct ip_rt_acct ip_rt_acct[256];
1953
1954#ifdef CONFIG_PROC_FS
1955static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
1956 int length, int *eof, void *data)
1957{
1958 *start=buffer;
1959
1960 if (offset + length > sizeof(ip_rt_acct)) {
1961 length = sizeof(ip_rt_acct) - offset;
1962 *eof = 1;
1963 }
1964 if (length > 0) {
1965 start_bh_atomic();
1966 memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
1967 end_bh_atomic();
1968 return length;
1969 }
1970 return 0;
1971}
1972#endif
1973#endif
1974
1975
1976__initfunc(void ip_rt_init(void))
1977{
1978#ifdef CONFIG_PROC_FS
1979#ifdef CONFIG_NET_CLS_ROUTE
1980 struct proc_dir_entry *ent;
1981#endif
1982#endif
1983 devinet_init();
1984 ip_fib_init();
1985 rt_periodic_timer.function = rt_check_expire;
1986
1987
1988
1989 rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
1990 + ip_rt_gc_interval;
1991 add_timer(&rt_periodic_timer);
1992
1993#ifdef CONFIG_PROC_FS
1994 proc_net_register(&(struct proc_dir_entry) {
1995 PROC_NET_RTCACHE, 8, "rt_cache",
1996 S_IFREG | S_IRUGO, 1, 0, 0,
1997 0, &proc_net_inode_operations,
1998 rt_cache_get_info
1999 });
2000#ifdef CONFIG_NET_CLS_ROUTE
2001 ent = create_proc_entry("net/rt_acct", 0, 0);
2002 ent->read_proc = ip_rt_acct_read;
2003#endif
2004#endif
2005}
2006