linux-old/net/ipv4/ip_output.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              The Internet Protocol (IP) output module.
   7 *
   8 * Version:     $Id: ip_output.c,v 1.67 1999/03/25 00:43:00 davem Exp $
   9 *
  10 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Donald Becker, <becker@super.org>
  13 *              Alan Cox, <Alan.Cox@linux.org>
  14 *              Richard Underwood
  15 *              Stefan Becker, <stefanb@yello.ping.de>
  16 *              Jorge Cwik, <jorge@laser.satlink.net>
  17 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18 *
  19 *      See ip_input.c for original log
  20 *
  21 *      Fixes:
  22 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  23 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  24 *              Bradford Johnson:       Fix faulty handling of some frames when 
  25 *                                      no route is found.
  26 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  27 *                                      (in case if packet not accepted by
  28 *                                      output firewall rules)
  29 *              Mike McLagan    :       Routing by source
  30 *              Alexey Kuznetsov:       use new route cache
  31 *              Andi Kleen:             Fix broken PMTU recovery and remove
  32 *                                      some redundant tests.
  33 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  34 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  35 *              Andi Kleen      :       Split fast and slow ip_build_xmit path 
  36 *                                      for decreased register pressure on x86 
   37 *                                      and more readability. 
  38 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  39 *                                      silently drop skb instead of failing with -EPERM.
  40 */
  41
  42#include <asm/uaccess.h>
  43#include <asm/system.h>
  44#include <linux/types.h>
  45#include <linux/kernel.h>
  46#include <linux/sched.h>
  47#include <linux/mm.h>
  48#include <linux/string.h>
  49#include <linux/errno.h>
  50#include <linux/config.h>
  51
  52#include <linux/socket.h>
  53#include <linux/sockios.h>
  54#include <linux/in.h>
  55#include <linux/inet.h>
  56#include <linux/netdevice.h>
  57#include <linux/etherdevice.h>
  58#include <linux/proc_fs.h>
  59#include <linux/stat.h>
  60#include <linux/init.h>
  61
  62#include <net/snmp.h>
  63#include <net/ip.h>
  64#include <net/protocol.h>
  65#include <net/route.h>
  66#include <net/tcp.h>
  67#include <net/udp.h>
  68#include <linux/skbuff.h>
  69#include <net/sock.h>
  70#include <net/arp.h>
  71#include <net/icmp.h>
  72#include <net/raw.h>
  73#include <net/checksum.h>
  74#include <linux/igmp.h>
  75#include <linux/ip_fw.h>
  76#include <linux/firewall.h>
  77#include <linux/mroute.h>
  78#include <linux/netlink.h>
  79
/*
 *      Shall we try to damage output packets if routing dev changes?
 *
 *      (Only defined here; nothing else in this file reads it.)
 */

int sysctl_ip_dynaddr = 0;


/*
 *      Running counter for the IP identification field: every header
 *      builder in this file stores htons(ip_id_count++) into iph->id.
 */
int ip_id_count = 0;
  88
  89/* Generate a checksum for an outgoing IP datagram. */
  90__inline__ void ip_send_check(struct iphdr *iph)
  91{
  92        iph->check = 0;
  93        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  94}
  95
  96/* 
  97 *              Add an ip header to a skbuff and send it out.
  98 */
  99void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 100                           u32 saddr, u32 daddr, struct ip_options *opt)
 101{
 102        struct rtable *rt = (struct rtable *)skb->dst;
 103        struct iphdr *iph;
 104        struct device *dev;
 105        
 106        /* Build the IP header. */
 107        if (opt)
 108                iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 109        else
 110                iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 111
 112        iph->version  = 4;
 113        iph->ihl      = 5;
 114        iph->tos      = sk->ip_tos;
 115        iph->frag_off = 0;
 116        if (ip_dont_fragment(sk, &rt->u.dst))
 117                iph->frag_off |= htons(IP_DF);
 118        iph->ttl      = sk->ip_ttl;
 119        iph->daddr    = rt->rt_dst;
 120        iph->saddr    = rt->rt_src;
 121        iph->protocol = sk->protocol;
 122        iph->tot_len  = htons(skb->len);
 123        iph->id       = htons(ip_id_count++);
 124        skb->nh.iph   = iph;
 125
 126        if (opt && opt->optlen) {
 127                iph->ihl += opt->optlen>>2;
 128                ip_options_build(skb, opt, daddr, rt, 0);
 129        }
 130
 131        dev = rt->u.dst.dev;
 132
 133#ifdef CONFIG_FIREWALL
 134        /* Now we have no better mechanism to notify about error. */
 135        switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
 136        case FW_REJECT:
 137                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 138                /* Fall thru... */
 139        case FW_BLOCK:
 140        case FW_QUEUE:
 141                kfree_skb(skb);
 142                return;
 143        }
 144#endif
 145
 146        ip_send_check(iph);
 147
 148        /* Send it out. */
 149        skb->dst->output(skb);
 150        return;
 151}
 152
 153int __ip_finish_output(struct sk_buff *skb)
 154{
 155        return ip_finish_output(skb);
 156}
 157
 158int ip_mc_output(struct sk_buff *skb)
 159{
 160        struct sock *sk = skb->sk;
 161        struct rtable *rt = (struct rtable*)skb->dst;
 162        struct device *dev = rt->u.dst.dev;
 163
 164        /*
 165         *      If the indicated interface is up and running, send the packet.
 166         */
 167         
 168        ip_statistics.IpOutRequests++;
 169#ifdef CONFIG_IP_ROUTE_NAT
 170        if (rt->rt_flags & RTCF_NAT)
 171                ip_do_nat(skb);
 172#endif
 173
 174        skb->dev = dev;
 175        skb->protocol = __constant_htons(ETH_P_IP);
 176
 177        /*
 178         *      Multicasts are looped back for other local users
 179         */
 180
 181        if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) {
 182#ifdef CONFIG_IP_MROUTE
 183                /* Small optimization: do not loopback not local frames,
 184                   which returned after forwarding; they will be  dropped
 185                   by ip_mr_input in any case.
 186                   Note, that local frames are looped back to be delivered
 187                   to local recipients.
 188
 189                   This check is duplicated in ip_mr_input at the moment.
 190                 */
 191                if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 192#endif
 193                dev_loopback_xmit(skb);
 194
 195                /* Multicasts with ttl 0 must not go beyond the host */
 196
 197                if (skb->nh.iph->ttl == 0) {
 198                        kfree_skb(skb);
 199                        return 0;
 200                }
 201        }
 202
 203        if (rt->rt_flags&RTCF_BROADCAST)
 204                dev_loopback_xmit(skb);
 205
 206        return ip_finish_output(skb);
 207}
 208
 209int ip_output(struct sk_buff *skb)
 210{
 211#ifdef CONFIG_IP_ROUTE_NAT
 212        struct rtable *rt = (struct rtable*)skb->dst;
 213#endif
 214
 215        ip_statistics.IpOutRequests++;
 216
 217#ifdef CONFIG_IP_ROUTE_NAT
 218        if (rt->rt_flags&RTCF_NAT)
 219                ip_do_nat(skb);
 220#endif
 221
 222        return ip_finish_output(skb);
 223}
 224
 225/* Queues a packet to be sent, and starts the transmitter if necessary.  
 226 * This routine also needs to put in the total length and compute the 
 227 * checksum.  We use to do this in two stages, ip_build_header() then
 228 * this, but that scheme created a mess when routes disappeared etc.
 229 * So we do it all here, and the TCP send engine has been changed to
 230 * match. (No more unroutable FIN disasters, etc. wheee...)  This will
 231 * most likely make other reliable transport layers above IP easier
 232 * to implement under Linux.
 233 */
 234void ip_queue_xmit(struct sk_buff *skb)
 235{
 236        struct sock *sk = skb->sk;
 237        struct ip_options *opt = sk->opt;
 238        struct rtable *rt;
 239        struct device *dev;
 240        struct iphdr *iph;
 241        unsigned int tot_len;
 242
 243        /* Make sure we can route this packet. */
 244        rt = (struct rtable *) sk->dst_cache;
 245        if(rt == NULL || rt->u.dst.obsolete) {
 246                u32 daddr;
 247
 248                sk->dst_cache = NULL;
 249                ip_rt_put(rt);
 250
 251                /* Use correct destination address if we have options. */
 252                daddr = sk->daddr;
 253                if(opt && opt->srr)
 254                        daddr = opt->faddr;
 255
 256                /* If this fails, retransmit mechanism of transport layer will
 257                 * keep trying until route appears or the connection times itself
 258                 * out.
 259                 */
 260                if(ip_route_output(&rt, daddr, sk->saddr,
 261                                   RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
 262                                   sk->bound_dev_if))
 263                        goto drop;
 264                sk->dst_cache = &rt->u.dst;
 265        }
 266        if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 267                goto no_route;
 268
 269        /* We have a route, so grab a reference. */
 270        skb->dst = dst_clone(sk->dst_cache);
 271
 272        /* OK, we know where to send it, allocate and build IP header. */
 273        iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 274        iph->version  = 4;
 275        iph->ihl      = 5;
 276        iph->tos      = sk->ip_tos;
 277        iph->frag_off = 0;
 278        iph->ttl      = sk->ip_ttl;
 279        iph->daddr    = rt->rt_dst;
 280        iph->saddr    = rt->rt_src;
 281        iph->protocol = sk->protocol;
 282        skb->nh.iph   = iph;
 283        /* Transport layer set skb->h.foo itself. */
 284
 285        if(opt && opt->optlen) {
 286                iph->ihl += opt->optlen >> 2;
 287                ip_options_build(skb, opt, sk->daddr, rt, 0);
 288        }
 289
 290        tot_len = skb->len;
 291        iph->tot_len = htons(tot_len);
 292        iph->id = htons(ip_id_count++);
 293
 294        dev = rt->u.dst.dev;
 295
 296#ifdef CONFIG_FIREWALL
 297        /* Now we have no better mechanism to notify about error. */
 298        switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
 299        case FW_REJECT:
 300                start_bh_atomic();
 301                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 302                end_bh_atomic();
 303                /* Fall thru... */
 304        case FW_BLOCK:
 305        case FW_QUEUE:
 306                goto drop;
 307        }
 308#endif
 309
 310        /* This can happen when the transport layer has segments queued
 311         * with a cached route, and by the time we get here things are
 312         * re-routed to a device with a different MTU than the original
 313         * device.  Sick, but we must cover it.
 314         */
 315        if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
 316                struct sk_buff *skb2;
 317
 318                skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
 319                kfree_skb(skb);
 320                if (skb2 == NULL)
 321                        return;
 322                if (sk)
 323                        skb_set_owner_w(skb, sk);
 324                skb = skb2;
 325                iph = skb->nh.iph;
 326        }
 327
 328        /* Do we need to fragment.  Again this is inefficient.  We
 329         * need to somehow lock the original buffer and use bits of it.
 330         */
 331        if (tot_len > rt->u.dst.pmtu)
 332                goto fragment;
 333
 334        if (ip_dont_fragment(sk, &rt->u.dst))
 335                iph->frag_off |= __constant_htons(IP_DF);
 336
 337        /* Add an IP checksum. */
 338        ip_send_check(iph);
 339
 340        skb->priority = sk->priority;
 341        skb->dst->output(skb);
 342        return;
 343
 344fragment:
 345        if (ip_dont_fragment(sk, &rt->u.dst) &&
 346            tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) {
 347                /* Reject packet ONLY if TCP might fragment
 348                   it itself, if were careful enough.
 349                   Test is not precise (f.e. it does not take sacks
 350                   into account). Actually, tcp should make it. --ANK (980801)
 351                 */
 352                iph->frag_off |= __constant_htons(IP_DF);
 353                NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));
 354
 355                /* icmp_send is not reenterable, so that bh_atomic... --ANK */
 356                start_bh_atomic();
 357                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 358                          htonl(rt->u.dst.pmtu));
 359                end_bh_atomic();
 360                goto drop;
 361        }
 362        ip_fragment(skb, skb->dst->output);
 363        return;
 364
 365no_route:
 366        sk->dst_cache = NULL;
 367        ip_rt_put(rt);
 368        ip_statistics.IpOutNoRoutes++;
 369        /* Fall through... */
 370drop:
 371        kfree_skb(skb);
 372}
 373
/*
 *      Build and send a packet, with as little as one copy
 *
 *      Doesn't care much about ip options... option length can be
 *      different for fragment at 0 and other fragments.
 *
 *      Note that the fragment at the highest offset is sent first,
 *      so the getfrag routine can fill in the TCP/UDP checksum header
 *      field in the last fragment it sends... actually it also helps
 *      the reassemblers, they can put most packets in at the head of
 *      the fragment queue, and they know the total size in advance. This
 *      last feature will measurably improve the Linux fragment handler one
 *      day.
 *
 *      The getfrag callback has four args: an arbitrary pointer (copy of
 *      frag), the destination buffer (char *), the offset to copy from,
 *      and the length to be copied.  A non-zero return from it is
 *      reported to the caller as -EFAULT.
 *
 *      length is expected to already include sizeof(struct iphdr); that
 *      amount is subtracted again below to get the payload size.
 *
 *      Returns 0 on success.  Errors: -EMSGSIZE (datagram over 0xFFFF
 *      bytes, or fragmentation needed while DF is wanted), the error set
 *      by sock_alloc_send_skb(), -EFAULT (getfrag failed), -EPERM
 *      (firewall said FW_BLOCK/FW_REJECT) and -ENETDOWN (the route's
 *      output routine returned non-zero).
 */

int ip_build_xmit_slow(struct sock *sk,
                  int getfrag (const void *,
                               char *,
                               unsigned int,    
                               unsigned int),
                  const void *frag,
                  unsigned length,
                  struct ipcm_cookie *ipc,
                  struct rtable *rt,
                  int flags)
{
        unsigned int fraglen, maxfraglen, fragheaderlen;
        int err;
        int offset, mf;
        int mtu;
        unsigned short id;

        /* Link-level header room, rounded up to a multiple of 16. */
        int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
        int nfrags=0;
        struct ip_options *opt = ipc->opt;
        int df = 0;

        mtu = rt->u.dst.pmtu;
        if (ip_dont_fragment(sk, &rt->u.dst))
                df = htons(IP_DF);
  
        /* From here on 'length' counts payload bytes only. */
        length -= sizeof(struct iphdr);

        if (opt) {
                fragheaderlen = sizeof(struct iphdr) + opt->optlen;
                maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
        } else {
                fragheaderlen = sizeof(struct iphdr);
                
                /*
                 *      Fragheaderlen is the size of 'overhead' on each buffer. Now work
                 *      out the size of the frames to send.
                 */
         
                maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
        }

        /* The 16-bit total length field cannot describe anything bigger. */
        if (length + fragheaderlen > 0xFFFF) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
                return -EMSGSIZE;
        }

        /*
         *      Start at the end of the frame by handling the remainder.
         */
         
        offset = length - (length % (maxfraglen - fragheaderlen));
        
        /*
         *      Amount of memory to allocate for final fragment.
         */
         
        fraglen = length - offset + fragheaderlen;
        
        /*
         *      No remainder: the final fragment is a full-sized one.
         *
         *      NOTE(review): this branch is also taken when length is 0,
         *      which leaves offset negative - confirm callers never get
         *      here with an empty payload.
         */
        if (length-offset==0) {
                fraglen = maxfraglen;
                offset -= maxfraglen-fragheaderlen;
        }
        
        
        /*
         *      The last fragment will not have MF (more fragments) set.
         */
         
        mf = 0;

        /*
         *      Don't fragment packets for path mtu discovery.
         */
         
        if (offset > 0 && df) { 
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
                return(-EMSGSIZE);
        }

        /*
         *      Lock the device lists.
         */

        dev_lock_list();
        
        /*
         *      Get an identifier (shared by all fragments of this datagram)
         */
         
        id = htons(ip_id_count++);

        /*
         *      Begin outputting the bytes, highest offset first.
         */
         
        do {
                char *data;
                struct sk_buff * skb;

                /*
                 *      Get the memory we require with some space left for alignment.
                 */

                skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
                if (skb == NULL)
                        goto error;

                /*
                 *      Fill in the control structures
                 */
                 
                skb->priority = sk->priority;
                skb->dst = dst_clone(&rt->u.dst);
                skb_reserve(skb, hh_len);

                /*
                 *      Find where to start putting bytes.
                 */
                 
                data = skb_put(skb, fraglen);
                skb->nh.iph = (struct iphdr *)data;

                /*
                 *      Build the IP header for this fragment (this path
                 *      always writes one).
                 */
                 
                {
                        struct iphdr *iph = (struct iphdr *)data;

                        iph->version = 4;
                        iph->ihl = 5;
                        if (opt) {
                                iph->ihl += opt->optlen>>2;
                                ip_options_build(skb, opt,
                                                 ipc->addr, rt, offset);
                        }
                        iph->tos = sk->ip_tos;
                        iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
                        iph->id = id;
                        /* Fragment offset is carried in units of 8 bytes. */
                        iph->frag_off = htons(offset>>3);
                        iph->frag_off |= mf|df;
                        if (rt->rt_type == RTN_MULTICAST)
                                iph->ttl = sk->ip_mc_ttl;
                        else
                                iph->ttl = sk->ip_ttl;
                        iph->protocol = sk->protocol;
                        iph->check = 0;
                        iph->saddr = rt->rt_src;
                        iph->daddr = rt->rt_dst;
                        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
                        /* Payload goes right behind the header just built. */
                        data += iph->ihl*4;
                        
                        /*
                         *      Any further fragments will have MF set.
                         */
                         
                        mf = htons(IP_MF);
                }
                
                /*
                 *      User data callback
                 */

                if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
                        err = -EFAULT;
                        kfree_skb(skb);
                        goto error;
                }

                /* Step back to the preceding, always full-sized, fragment. */
                offset -= (maxfraglen-fragheaderlen);
                fraglen = maxfraglen;

                nfrags++;

#ifdef CONFIG_FIREWALL
                switch (call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb)) {
                case FW_QUEUE:
                        /* Silently drop this fragment and carry on with the next. */
                        kfree_skb(skb);
                        continue;
                case FW_BLOCK:
                case FW_REJECT:
                        kfree_skb(skb);
                        err = -EPERM;
                        goto error;
                }
#endif

                /* Error reported if the output routine refuses the fragment. */
                err = -ENETDOWN;
                if (rt->u.dst.output(skb))
                        goto error;
        } while (offset >= 0);

        if (nfrags>1)
                ip_statistics.IpFragCreates += nfrags;
        dev_unlock_list();
        return 0;

error:
        /* Account for the discard and for any fragments already built. */
        ip_statistics.IpOutDiscards++;
        if (nfrags>1)
                ip_statistics.IpFragCreates += nfrags;
        dev_unlock_list();
        return err; 
}
 599
 600
 601/*
 602 *      Fast path for unfragmented packets.
 603 */
 604int ip_build_xmit(struct sock *sk, 
 605                  int getfrag (const void *,
 606                               char *,
 607                               unsigned int,    
 608                               unsigned int),
 609                  const void *frag,
 610                  unsigned length,
 611                  struct ipcm_cookie *ipc,
 612                  struct rtable *rt,
 613                  int flags)
 614{
 615        int err;
 616        struct sk_buff *skb;
 617        int df;
 618        struct iphdr *iph;
 619
 620        /*
 621         *      Try the simple case first. This leaves fragmented frames, and by
 622         *      choice RAW frames within 20 bytes of maximum size(rare) to the long path
 623         */
 624
 625        if (!sk->ip_hdrincl) {
 626                length += sizeof(struct iphdr);
 627
 628                /*
 629                 *      Check for slow path.
 630                 */
 631                if (length > rt->u.dst.pmtu || ipc->opt != NULL)  
 632                        return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags); 
 633        } else {
 634                if (length > rt->u.dst.dev->mtu) {
 635                        ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
 636                        return -EMSGSIZE;
 637                }
 638        }
 639
 640        /*
 641         *      Do path mtu discovery if needed.
 642         */
 643        df = 0;
 644        if (ip_dont_fragment(sk, &rt->u.dst))
 645                df = htons(IP_DF);
 646
 647        /* 
 648         *      Fast path for unfragmented frames without options. 
 649         */ 
 650        {
 651        int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
 652
 653        skb = sock_alloc_send_skb(sk, length+hh_len+15,
 654                                  0, flags&MSG_DONTWAIT, &err);
 655        if(skb==NULL)
 656                goto error; 
 657        skb_reserve(skb, hh_len);
 658        }
 659        
 660        skb->priority = sk->priority;
 661        skb->dst = dst_clone(&rt->u.dst);
 662
 663        skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
 664        
 665        dev_lock_list();
 666        
 667        if(!sk->ip_hdrincl) {
 668                iph->version=4;
 669                iph->ihl=5;
 670                iph->tos=sk->ip_tos;
 671                iph->tot_len = htons(length);
 672                iph->id=htons(ip_id_count++);
 673                iph->frag_off = df;
 674                iph->ttl=sk->ip_mc_ttl;
 675                if (rt->rt_type != RTN_MULTICAST)
 676                        iph->ttl=sk->ip_ttl;
 677                iph->protocol=sk->protocol;
 678                iph->saddr=rt->rt_src;
 679                iph->daddr=rt->rt_dst;
 680                iph->check=0;
 681                iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 682                err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
 683        }
 684        else
 685                err = getfrag(frag, (void *)iph, 0, length);
 686
 687        dev_unlock_list();
 688
 689        if (err)
 690                goto error_fault;
 691
 692#ifdef CONFIG_FIREWALL
 693        switch (call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb)) {
 694        case FW_QUEUE:
 695                kfree_skb(skb);
 696                return 0;
 697        case FW_BLOCK:
 698        case FW_REJECT:
 699                kfree_skb(skb);
 700                err = -EPERM;
 701                goto error;
 702        }
 703#endif
 704
 705        return rt->u.dst.output(skb);
 706
 707error_fault:
 708        err = -EFAULT;
 709        kfree_skb(skb);
 710error:
 711        ip_statistics.IpOutDiscards++;
 712        return err; 
 713}
 714                       
 715
 716
/*
 *      This IP datagram is too large to be sent in one piece.  Break it up into
 *      smaller pieces (each of size equal to IP header plus
 *      a block of the data of the original IP data part) that will yet fit in a
 *      single device frame, and queue such a frame for sending.
 *
 *      skb     the datagram to split; skb->dst supplies the path MTU and
 *              the device.  It is always freed here, on success and on
 *              failure alike.
 *      output  called once for every fragment built; its return value
 *              is ignored.
 *
 *      If a fragment buffer cannot be allocated the function stops and
 *      counts IpFragFails; fragments already handed to output() are not
 *      taken back.
 *
 *      Yes this is inefficient, feel free to submit a quicker one.
 */

void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct iphdr *iph;
        unsigned char *raw;
        unsigned char *ptr;
        struct device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len; 
        int offset;
        int not_last_frag;
        struct rtable *rt = (struct rtable*)skb->dst;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        raw = skb->nh.raw;
        iph = (struct iphdr*)raw;

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        left = ntohs(iph->tot_len) - hlen;      /* Payload bytes still to send */
        mtu = rt->u.dst.pmtu - hlen;    /* Payload room in each fragment */
        ptr = raw + hlen;                       /* Where to start from */

        /*
         *      The protocol doesn't seem to say what to do in the case that the
         *      frame + options doesn't fit the mtu. As it used to fall down dead
         *      in this case we were fortunate it didn't happen
         *
         *      It is impossible, because mtu>=68. --ANK (980801)
         */

#ifdef CONFIG_NET_PARANOIA
        if (mtu<8) 
                goto fail;
#endif

        /*
         *      Fragment the datagram.  The incoming packet may itself be a
         *      fragment, so start from its own offset and MF flag.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while(left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                skb2->pkt_type = skb->pkt_type;
                skb2->priority = skb->priority;
                skb_reserve(skb2, (dev->hard_header_len+15)&~15);
                skb_put(skb2, len + hlen);
                skb2->nh.raw = skb2->data;
                skb2->h.raw = skb2->data + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                skb2->dst = dst_clone(skb->dst);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                memcpy(skb2->nh.raw, raw, hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                memcpy(skb2->h.raw, ptr, len);
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = skb2->nh.iph;
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 *
                 * Note the call comes after the header was copied into
                 * skb2, so the first fragment keeps the unmodified options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC : If we are fragmenting a fragment that's not the
                 *                 last fragment then keep MF on each bit
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                ip_statistics.IpFragCreates++;

                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                output(skb2);
        }
        /* All payload copied out: the original buffer is no longer needed. */
        kfree_skb(skb);
        ip_statistics.IpFragOKs++;
        return;
        
fail:
        kfree_skb(skb); 
        ip_statistics.IpFragFails++; 
}
 875
 876/*
 877 *      Fetch data from kernel space and fill in checksum if needed.
 878 */
 879static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset, 
 880                              unsigned int fraglen)
 881{
 882        struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
 883        u16 *pktp = (u16 *)to;
 884        struct iovec *iov; 
 885        int len; 
 886        int hdrflag = 1; 
 887
 888        iov = &dp->iov[0]; 
 889        if (offset >= iov->iov_len) { 
 890                offset -= iov->iov_len;
 891                iov++; 
 892                hdrflag = 0; 
 893        }
 894        len = iov->iov_len - offset;
 895        if (fraglen > len) { /* overlapping. */ 
 896                dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
 897                                             dp->csum);
 898                offset = 0;
 899                fraglen -= len; 
 900                to += len; 
 901                iov++;
 902        }
 903
 904        dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen, 
 905                                             dp->csum); 
 906
 907        if (hdrflag && dp->csumoffset)
 908                *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
 909        return 0;              
 910}
 911
 912/* 
 913 *      Generic function to send a packet as reply to another packet.
 914 *      Used to send TCP resets so far. ICMP should use this function too.
 915 *
 916 *      Should run single threaded per socket because it uses the sock 
 917 *      structure to pass arguments.
 918 */
 919void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
 920                   unsigned int len)
 921{
 922        struct {
 923                struct ip_options       opt;
 924                char                    data[40];
 925        } replyopts;
 926        struct ipcm_cookie ipc;
 927        u32 daddr;
 928        struct rtable *rt = (struct rtable*)skb->dst;
 929        
 930        if (ip_options_echo(&replyopts.opt, skb))
 931                return;
 932        
 933        sk->ip_tos = skb->nh.iph->tos;
 934        sk->priority = skb->priority;
 935        sk->protocol = skb->nh.iph->protocol;
 936
 937        daddr = ipc.addr = rt->rt_src;
 938        ipc.opt = &replyopts.opt;
 939        
 940        if (ipc.opt->srr)
 941                daddr = replyopts.opt.faddr;
 942        if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
 943                return;
 944
 945        /* And let IP do all the hard work. */
 946        ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
 947        ip_rt_put(rt);
 948}
 949
/*
 *      IP protocol layer initialiser
 *
 *      Packet type descriptor that ip_init() hands to dev_add_pack().
 *      The initialiser is positional; the field names are declared with
 *      struct packet_type, outside this file.
 */

static struct packet_type ip_packet_type =
{
        __constant_htons(ETH_P_IP),     /* ETH_P_IP in network byte order */
        NULL,   /* All devices */
        ip_rcv, /* presumably the receive handler - confirm against struct packet_type */
        NULL,
        NULL,
};
 962
 963
 964
/*
 *      proc entry named "igmp", registered by ip_init() through
 *      proc_net_register().  Only built with both CONFIG_PROC_FS and
 *      CONFIG_IP_MULTICAST.  The initialiser is positional; the 4 matches
 *      the length of the name string (presumably the name-length field -
 *      confirm against struct proc_dir_entry).
 */
#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
static struct proc_dir_entry proc_net_igmp = {
        PROC_NET_IGMP, 4, "igmp",
        S_IFREG | S_IRUGO, 1, 0, 0,
        0, &proc_net_inode_operations,
        ip_mc_procinfo
};
#endif
#endif  
 975
 976/*
 977 *      IP registers the packet type and then calls the subprotocol initialisers
 978 */
 979
 980__initfunc(void ip_init(void))
 981{
 982        dev_add_pack(&ip_packet_type);
 983
 984        ip_rt_init();
 985
 986#ifdef CONFIG_PROC_FS
 987#ifdef CONFIG_IP_MULTICAST
 988        proc_net_register(&proc_net_igmp);
 989#endif
 990#endif  
 991}
 992
 993
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.