linux/net/ipv4/ip_output.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              The Internet Protocol (IP) output module.
   7 *
   8 * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Donald Becker, <becker@super.org>
  13 *              Alan Cox, <Alan.Cox@linux.org>
  14 *              Richard Underwood
  15 *              Stefan Becker, <stefanb@yello.ping.de>
  16 *              Jorge Cwik, <jorge@laser.satlink.net>
  17 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18 *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19 *
  20 *      See ip_input.c for original log
  21 *
  22 *      Fixes:
  23 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25 *              Bradford Johnson:       Fix faulty handling of some frames when
  26 *                                      no route is found.
  27 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28 *                                      (in case if packet not accepted by
  29 *                                      output firewall rules)
  30 *              Mike McLagan    :       Routing by source
  31 *              Alexey Kuznetsov:       use new route cache
  32 *              Andi Kleen:             Fix broken PMTU recovery and remove
  33 *                                      some redundant tests.
  34 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37 *                                      for decreased register pressure on x86
   38 *                                      and more readability.
  39 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40 *                                      silently drop skb instead of failing with -EPERM.
  41 *              Detlev Wengorz  :       Copy protocol for fragments.
  42 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43 *                                      datagrams.
  44 *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45 */
  46
  47#include <asm/uaccess.h>
  48#include <asm/system.h>
  49#include <linux/module.h>
  50#include <linux/types.h>
  51#include <linux/kernel.h>
  52#include <linux/mm.h>
  53#include <linux/string.h>
  54#include <linux/errno.h>
  55#include <linux/highmem.h>
  56
  57#include <linux/socket.h>
  58#include <linux/sockios.h>
  59#include <linux/in.h>
  60#include <linux/inet.h>
  61#include <linux/netdevice.h>
  62#include <linux/etherdevice.h>
  63#include <linux/proc_fs.h>
  64#include <linux/stat.h>
  65#include <linux/init.h>
  66
  67#include <net/snmp.h>
  68#include <net/ip.h>
  69#include <net/protocol.h>
  70#include <net/route.h>
  71#include <net/xfrm.h>
  72#include <linux/skbuff.h>
  73#include <net/sock.h>
  74#include <net/arp.h>
  75#include <net/icmp.h>
  76#include <net/checksum.h>
  77#include <net/inetpeer.h>
  78#include <linux/igmp.h>
  79#include <linux/netfilter_ipv4.h>
  80#include <linux/netfilter_bridge.h>
  81#include <linux/mroute.h>
  82#include <linux/netlink.h>
  83#include <linux/tcp.h>
  84
/*
 * System-wide default IP time-to-live, initialised to IPDEFTTL.
 * NOTE(review): presumably tunable through the ip_default_ttl sysctl; the
 * registration is not visible in this part of the file - confirm.
 */
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
  86
  87/* Generate a checksum for an outgoing IP datagram. */
  88__inline__ void ip_send_check(struct iphdr *iph)
  89{
  90        iph->check = 0;
  91        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  92}
  93
  94int __ip_local_out(struct sk_buff *skb)
  95{
  96        struct iphdr *iph = ip_hdr(skb);
  97
  98        iph->tot_len = htons(skb->len);
  99        ip_send_check(iph);
 100        return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
 101                       dst_output);
 102}
 103
 104int ip_local_out(struct sk_buff *skb)
 105{
 106        int err;
 107
 108        err = __ip_local_out(skb);
 109        if (likely(err == 1))
 110                err = dst_output(skb);
 111
 112        return err;
 113}
 114EXPORT_SYMBOL_GPL(ip_local_out);
 115
 116/* dev_loopback_xmit for use with netfilter. */
 117static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 118{
 119        skb_reset_mac_header(newskb);
 120        __skb_pull(newskb, skb_network_offset(newskb));
 121        newskb->pkt_type = PACKET_LOOPBACK;
 122        newskb->ip_summed = CHECKSUM_UNNECESSARY;
 123        BUG_TRAP(newskb->dst);
 124        netif_rx(newskb);
 125        return 0;
 126}
 127
 128static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 129{
 130        int ttl = inet->uc_ttl;
 131
 132        if (ttl < 0)
 133                ttl = dst_metric(dst, RTAX_HOPLIMIT);
 134        return ttl;
 135}
 136
 137/*
 138 *              Add an ip header to a skbuff and send it out.
 139 *
 140 */
 141int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 142                          __be32 saddr, __be32 daddr, struct ip_options *opt)
 143{
 144        struct inet_sock *inet = inet_sk(sk);
 145        struct rtable *rt = (struct rtable *)skb->dst;
 146        struct iphdr *iph;
 147
 148        /* Build the IP header. */
 149        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 150        skb_reset_network_header(skb);
 151        iph = ip_hdr(skb);
 152        iph->version  = 4;
 153        iph->ihl      = 5;
 154        iph->tos      = inet->tos;
 155        if (ip_dont_fragment(sk, &rt->u.dst))
 156                iph->frag_off = htons(IP_DF);
 157        else
 158                iph->frag_off = 0;
 159        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 160        iph->daddr    = rt->rt_dst;
 161        iph->saddr    = rt->rt_src;
 162        iph->protocol = sk->sk_protocol;
 163        ip_select_ident(iph, &rt->u.dst, sk);
 164
 165        if (opt && opt->optlen) {
 166                iph->ihl += opt->optlen>>2;
 167                ip_options_build(skb, opt, daddr, rt, 0);
 168        }
 169
 170        skb->priority = sk->sk_priority;
 171        skb->mark = sk->sk_mark;
 172
 173        /* Send it out. */
 174        return ip_local_out(skb);
 175}
 176
 177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 178
 179static inline int ip_finish_output2(struct sk_buff *skb)
 180{
 181        struct dst_entry *dst = skb->dst;
 182        struct rtable *rt = (struct rtable *)dst;
 183        struct net_device *dev = dst->dev;
 184        unsigned int hh_len = LL_RESERVED_SPACE(dev);
 185
 186        if (rt->rt_type == RTN_MULTICAST)
 187                IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
 188        else if (rt->rt_type == RTN_BROADCAST)
 189                IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
 190
 191        /* Be paranoid, rather than too clever. */
 192        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
 193                struct sk_buff *skb2;
 194
 195                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 196                if (skb2 == NULL) {
 197                        kfree_skb(skb);
 198                        return -ENOMEM;
 199                }
 200                if (skb->sk)
 201                        skb_set_owner_w(skb2, skb->sk);
 202                kfree_skb(skb);
 203                skb = skb2;
 204        }
 205
 206        if (dst->hh)
 207                return neigh_hh_output(dst->hh, skb);
 208        else if (dst->neighbour)
 209                return dst->neighbour->output(skb);
 210
 211        if (net_ratelimit())
 212                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 213        kfree_skb(skb);
 214        return -EINVAL;
 215}
 216
 217static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 218{
 219        struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 220
 221        return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
 222               skb->dst->dev->mtu : dst_mtu(skb->dst);
 223}
 224
 225static int ip_finish_output(struct sk_buff *skb)
 226{
 227#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 228        /* Policy lookup after SNAT yielded a new policy */
 229        if (skb->dst->xfrm != NULL) {
 230                IPCB(skb)->flags |= IPSKB_REROUTED;
 231                return dst_output(skb);
 232        }
 233#endif
 234        if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 235                return ip_fragment(skb, ip_finish_output2);
 236        else
 237                return ip_finish_output2(skb);
 238}
 239
 240int ip_mc_output(struct sk_buff *skb)
 241{
 242        struct sock *sk = skb->sk;
 243        struct rtable *rt = (struct rtable*)skb->dst;
 244        struct net_device *dev = rt->u.dst.dev;
 245
 246        /*
 247         *      If the indicated interface is up and running, send the packet.
 248         */
 249        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 250
 251        skb->dev = dev;
 252        skb->protocol = htons(ETH_P_IP);
 253
 254        /*
 255         *      Multicasts are looped back for other local users
 256         */
 257
 258        if (rt->rt_flags&RTCF_MULTICAST) {
 259                if ((!sk || inet_sk(sk)->mc_loop)
 260#ifdef CONFIG_IP_MROUTE
 261                /* Small optimization: do not loopback not local frames,
 262                   which returned after forwarding; they will be  dropped
 263                   by ip_mr_input in any case.
 264                   Note, that local frames are looped back to be delivered
 265                   to local recipients.
 266
 267                   This check is duplicated in ip_mr_input at the moment.
 268                 */
 269                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 270#endif
 271                ) {
 272                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 273                        if (newskb)
 274                                NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
 275                                        NULL, newskb->dev,
 276                                        ip_dev_loopback_xmit);
 277                }
 278
 279                /* Multicasts with ttl 0 must not go beyond the host */
 280
 281                if (ip_hdr(skb)->ttl == 0) {
 282                        kfree_skb(skb);
 283                        return 0;
 284                }
 285        }
 286
 287        if (rt->rt_flags&RTCF_BROADCAST) {
 288                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 289                if (newskb)
 290                        NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
 291                                newskb->dev, ip_dev_loopback_xmit);
 292        }
 293
 294        return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 295                            ip_finish_output,
 296                            !(IPCB(skb)->flags & IPSKB_REROUTED));
 297}
 298
 299int ip_output(struct sk_buff *skb)
 300{
 301        struct net_device *dev = skb->dst->dev;
 302
 303        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 304
 305        skb->dev = dev;
 306        skb->protocol = htons(ETH_P_IP);
 307
 308        return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
 309                            ip_finish_output,
 310                            !(IPCB(skb)->flags & IPSKB_REROUTED));
 311}
 312
 313int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 314{
 315        struct sock *sk = skb->sk;
 316        struct inet_sock *inet = inet_sk(sk);
 317        struct ip_options *opt = inet->opt;
 318        struct rtable *rt;
 319        struct iphdr *iph;
 320
 321        /* Skip all of this if the packet is already routed,
 322         * f.e. by something like SCTP.
 323         */
 324        rt = (struct rtable *) skb->dst;
 325        if (rt != NULL)
 326                goto packet_routed;
 327
 328        /* Make sure we can route this packet. */
 329        rt = (struct rtable *)__sk_dst_check(sk, 0);
 330        if (rt == NULL) {
 331                __be32 daddr;
 332
 333                /* Use correct destination address if we have options. */
 334                daddr = inet->daddr;
 335                if(opt && opt->srr)
 336                        daddr = opt->faddr;
 337
 338                {
 339                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
 340                                            .nl_u = { .ip4_u =
 341                                                      { .daddr = daddr,
 342                                                        .saddr = inet->saddr,
 343                                                        .tos = RT_CONN_FLAGS(sk) } },
 344                                            .proto = sk->sk_protocol,
 345                                            .uli_u = { .ports =
 346                                                       { .sport = inet->sport,
 347                                                         .dport = inet->dport } } };
 348
 349                        /* If this fails, retransmit mechanism of transport layer will
 350                         * keep trying until route appears or the connection times
 351                         * itself out.
 352                         */
 353                        security_sk_classify_flow(sk, &fl);
 354                        if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0))
 355                                goto no_route;
 356                }
 357                sk_setup_caps(sk, &rt->u.dst);
 358        }
 359        skb->dst = dst_clone(&rt->u.dst);
 360
 361packet_routed:
 362        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 363                goto no_route;
 364
 365        /* OK, we know where to send it, allocate and build IP header. */
 366        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 367        skb_reset_network_header(skb);
 368        iph = ip_hdr(skb);
 369        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 370        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 371                iph->frag_off = htons(IP_DF);
 372        else
 373                iph->frag_off = 0;
 374        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 375        iph->protocol = sk->sk_protocol;
 376        iph->saddr    = rt->rt_src;
 377        iph->daddr    = rt->rt_dst;
 378        /* Transport layer set skb->h.foo itself. */
 379
 380        if (opt && opt->optlen) {
 381                iph->ihl += opt->optlen >> 2;
 382                ip_options_build(skb, opt, inet->daddr, rt, 0);
 383        }
 384
 385        ip_select_ident_more(iph, &rt->u.dst, sk,
 386                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 387
 388        skb->priority = sk->sk_priority;
 389        skb->mark = sk->sk_mark;
 390
 391        return ip_local_out(skb);
 392
 393no_route:
 394        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 395        kfree_skb(skb);
 396        return -EHOSTUNREACH;
 397}
 398
 399
 400static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 401{
 402        to->pkt_type = from->pkt_type;
 403        to->priority = from->priority;
 404        to->protocol = from->protocol;
 405        dst_release(to->dst);
 406        to->dst = dst_clone(from->dst);
 407        to->dev = from->dev;
 408        to->mark = from->mark;
 409
 410        /* Copy the flags to each fragment. */
 411        IPCB(to)->flags = IPCB(from)->flags;
 412
 413#ifdef CONFIG_NET_SCHED
 414        to->tc_index = from->tc_index;
 415#endif
 416        nf_copy(to, from);
 417#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 418    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 419        to->nf_trace = from->nf_trace;
 420#endif
 421#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 422        to->ipvs_property = from->ipvs_property;
 423#endif
 424        skb_copy_secmark(to, from);
 425}
 426
 427/*
 428 *      This IP datagram is too large to be sent in one piece.  Break it up into
 429 *      smaller pieces (each of size equal to IP header plus
 430 *      a block of the data of the original IP data part) that will yet fit in a
 431 *      single device frame, and queue such a frame for sending.
 432 */
 433
 434int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 435{
 436        struct iphdr *iph;
 437        int raw = 0;
 438        int ptr;
 439        struct net_device *dev;
 440        struct sk_buff *skb2;
 441        unsigned int mtu, hlen, left, len, ll_rs, pad;
 442        int offset;
 443        __be16 not_last_frag;
 444        struct rtable *rt = (struct rtable*)skb->dst;
 445        int err = 0;
 446
 447        dev = rt->u.dst.dev;
 448
 449        /*
 450         *      Point into the IP datagram header.
 451         */
 452
 453        iph = ip_hdr(skb);
 454
 455        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 456                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 457                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 458                          htonl(ip_skb_dst_mtu(skb)));
 459                kfree_skb(skb);
 460                return -EMSGSIZE;
 461        }
 462
 463        /*
 464         *      Setup starting values.
 465         */
 466
 467        hlen = iph->ihl * 4;
 468        mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 469        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 470
 471        /* When frag_list is given, use it. First, check its validity:
 472         * some transformers could create wrong frag_list or break existing
 473         * one, it is not prohibited. In this case fall back to copying.
 474         *
 475         * LATER: this step can be merged to real generation of fragments,
 476         * we can switch to copy when see the first bad fragment.
 477         */
 478        if (skb_shinfo(skb)->frag_list) {
 479                struct sk_buff *frag;
 480                int first_len = skb_pagelen(skb);
 481                int truesizes = 0;
 482
 483                if (first_len - hlen > mtu ||
 484                    ((first_len - hlen) & 7) ||
 485                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 486                    skb_cloned(skb))
 487                        goto slow_path;
 488
 489                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 490                        /* Correct geometry. */
 491                        if (frag->len > mtu ||
 492                            ((frag->len & 7) && frag->next) ||
 493                            skb_headroom(frag) < hlen)
 494                            goto slow_path;
 495
 496                        /* Partially cloned skb? */
 497                        if (skb_shared(frag))
 498                                goto slow_path;
 499
 500                        BUG_ON(frag->sk);
 501                        if (skb->sk) {
 502                                sock_hold(skb->sk);
 503                                frag->sk = skb->sk;
 504                                frag->destructor = sock_wfree;
 505                                truesizes += frag->truesize;
 506                        }
 507                }
 508
 509                /* Everything is OK. Generate! */
 510
 511                err = 0;
 512                offset = 0;
 513                frag = skb_shinfo(skb)->frag_list;
 514                skb_shinfo(skb)->frag_list = NULL;
 515                skb->data_len = first_len - skb_headlen(skb);
 516                skb->truesize -= truesizes;
 517                skb->len = first_len;
 518                iph->tot_len = htons(first_len);
 519                iph->frag_off = htons(IP_MF);
 520                ip_send_check(iph);
 521
 522                for (;;) {
 523                        /* Prepare header of the next frame,
 524                         * before previous one went down. */
 525                        if (frag) {
 526                                frag->ip_summed = CHECKSUM_NONE;
 527                                skb_reset_transport_header(frag);
 528                                __skb_push(frag, hlen);
 529                                skb_reset_network_header(frag);
 530                                memcpy(skb_network_header(frag), iph, hlen);
 531                                iph = ip_hdr(frag);
 532                                iph->tot_len = htons(frag->len);
 533                                ip_copy_metadata(frag, skb);
 534                                if (offset == 0)
 535                                        ip_options_fragment(frag);
 536                                offset += skb->len - hlen;
 537                                iph->frag_off = htons(offset>>3);
 538                                if (frag->next != NULL)
 539                                        iph->frag_off |= htons(IP_MF);
 540                                /* Ready, complete checksum */
 541                                ip_send_check(iph);
 542                        }
 543
 544                        err = output(skb);
 545
 546                        if (!err)
 547                                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 548                        if (err || !frag)
 549                                break;
 550
 551                        skb = frag;
 552                        frag = skb->next;
 553                        skb->next = NULL;
 554                }
 555
 556                if (err == 0) {
 557                        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 558                        return 0;
 559                }
 560
 561                while (frag) {
 562                        skb = frag->next;
 563                        kfree_skb(frag);
 564                        frag = skb;
 565                }
 566                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 567                return err;
 568        }
 569
 570slow_path:
 571        left = skb->len - hlen;         /* Space per frame */
 572        ptr = raw + hlen;               /* Where to start from */
 573
 574        /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 575         * we need to make room for the encapsulating header
 576         */
 577        pad = nf_bridge_pad(skb);
 578        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
 579        mtu -= pad;
 580
 581        /*
 582         *      Fragment the datagram.
 583         */
 584
 585        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 586        not_last_frag = iph->frag_off & htons(IP_MF);
 587
 588        /*
 589         *      Keep copying data until we run out.
 590         */
 591
 592        while (left > 0) {
 593                len = left;
 594                /* IF: it doesn't fit, use 'mtu' - the data space left */
 595                if (len > mtu)
 596                        len = mtu;
 597                /* IF: we are not sending upto and including the packet end
 598                   then align the next start on an eight byte boundary */
 599                if (len < left) {
 600                        len &= ~7;
 601                }
 602                /*
 603                 *      Allocate buffer.
 604                 */
 605
 606                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 607                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 608                        err = -ENOMEM;
 609                        goto fail;
 610                }
 611
 612                /*
 613                 *      Set up data on packet
 614                 */
 615
 616                ip_copy_metadata(skb2, skb);
 617                skb_reserve(skb2, ll_rs);
 618                skb_put(skb2, len + hlen);
 619                skb_reset_network_header(skb2);
 620                skb2->transport_header = skb2->network_header + hlen;
 621
 622                /*
 623                 *      Charge the memory for the fragment to any owner
 624                 *      it might possess
 625                 */
 626
 627                if (skb->sk)
 628                        skb_set_owner_w(skb2, skb->sk);
 629
 630                /*
 631                 *      Copy the packet header into the new buffer.
 632                 */
 633
 634                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 635
 636                /*
 637                 *      Copy a block of the IP datagram.
 638                 */
 639                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 640                        BUG();
 641                left -= len;
 642
 643                /*
 644                 *      Fill in the new header fields.
 645                 */
 646                iph = ip_hdr(skb2);
 647                iph->frag_off = htons((offset >> 3));
 648
 649                /* ANK: dirty, but effective trick. Upgrade options only if
 650                 * the segment to be fragmented was THE FIRST (otherwise,
 651                 * options are already fixed) and make it ONCE
 652                 * on the initial skb, so that all the following fragments
 653                 * will inherit fixed options.
 654                 */
 655                if (offset == 0)
 656                        ip_options_fragment(skb);
 657
 658                /*
 659                 *      Added AC : If we are fragmenting a fragment that's not the
 660                 *                 last fragment then keep MF on each bit
 661                 */
 662                if (left > 0 || not_last_frag)
 663                        iph->frag_off |= htons(IP_MF);
 664                ptr += len;
 665                offset += len;
 666
 667                /*
 668                 *      Put this fragment into the sending queue.
 669                 */
 670                iph->tot_len = htons(len + hlen);
 671
 672                ip_send_check(iph);
 673
 674                err = output(skb2);
 675                if (err)
 676                        goto fail;
 677
 678                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 679        }
 680        kfree_skb(skb);
 681        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 682        return err;
 683
 684fail:
 685        kfree_skb(skb);
 686        IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 687        return err;
 688}
 689
 690EXPORT_SYMBOL(ip_fragment);
 691
 692int
 693ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 694{
 695        struct iovec *iov = from;
 696
 697        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 698                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 699                        return -EFAULT;
 700        } else {
 701                __wsum csum = 0;
 702                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 703                        return -EFAULT;
 704                skb->csum = csum_block_add(skb->csum, csum, odd);
 705        }
 706        return 0;
 707}
 708
 709static inline __wsum
 710csum_page(struct page *page, int offset, int copy)
 711{
 712        char *kaddr;
 713        __wsum csum;
 714        kaddr = kmap(page);
 715        csum = csum_partial(kaddr + offset, copy, 0);
 716        kunmap(page);
 717        return csum;
 718}
 719
 720static inline int ip_ufo_append_data(struct sock *sk,
 721                        int getfrag(void *from, char *to, int offset, int len,
 722                               int odd, struct sk_buff *skb),
 723                        void *from, int length, int hh_len, int fragheaderlen,
 724                        int transhdrlen, int mtu,unsigned int flags)
 725{
 726        struct sk_buff *skb;
 727        int err;
 728
 729        /* There is support for UDP fragmentation offload by network
 730         * device, so create one single skb packet containing complete
 731         * udp datagram
 732         */
 733        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 734                skb = sock_alloc_send_skb(sk,
 735                        hh_len + fragheaderlen + transhdrlen + 20,
 736                        (flags & MSG_DONTWAIT), &err);
 737
 738                if (skb == NULL)
 739                        return err;
 740
 741                /* reserve space for Hardware header */
 742                skb_reserve(skb, hh_len);
 743
 744                /* create space for UDP/IP header */
 745                skb_put(skb,fragheaderlen + transhdrlen);
 746
 747                /* initialize network header pointer */
 748                skb_reset_network_header(skb);
 749
 750                /* initialize protocol header pointer */
 751                skb->transport_header = skb->network_header + fragheaderlen;
 752
 753                skb->ip_summed = CHECKSUM_PARTIAL;
 754                skb->csum = 0;
 755                sk->sk_sndmsg_off = 0;
 756        }
 757
 758        err = skb_append_datato_frags(sk,skb, getfrag, from,
 759                               (length - transhdrlen));
 760        if (!err) {
 761                /* specify the length of each IP datagram fragment*/
 762                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 763                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 764                __skb_queue_tail(&sk->sk_write_queue, skb);
 765
 766                return 0;
 767        }
 768        /* There is not enough support do UFO ,
 769         * so follow normal path
 770         */
 771        kfree_skb(skb);
 772        return err;
 773}
 774
 775/*
 776 *      ip_append_data() and ip_append_page() can make one large IP datagram
  777 *      from many pieces of data. Each piece will be held on the socket
 778 *      until ip_push_pending_frames() is called. Each piece can be a page
 779 *      or non-page data.
 780 *
 781 *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 782 *      this interface potentially.
 783 *
 784 *      LATER: length must be adjusted by pad at tail, when it is required.
 785 */
 786int ip_append_data(struct sock *sk,
 787                   int getfrag(void *from, char *to, int offset, int len,
 788                               int odd, struct sk_buff *skb),
 789                   void *from, int length, int transhdrlen,
 790                   struct ipcm_cookie *ipc, struct rtable *rt,
 791                   unsigned int flags)
 792{
 793        struct inet_sock *inet = inet_sk(sk);
 794        struct sk_buff *skb;
 795
 796        struct ip_options *opt = NULL;
 797        int hh_len;
 798        int exthdrlen;
 799        int mtu;
 800        int copy;
 801        int err;
 802        int offset = 0;
 803        unsigned int maxfraglen, fragheaderlen;
 804        int csummode = CHECKSUM_NONE;
 805
 806        if (flags&MSG_PROBE)
 807                return 0;
 808
 809        if (skb_queue_empty(&sk->sk_write_queue)) {
 810                /*
 811                 * setup for corking.
 812                 */
 813                opt = ipc->opt;
 814                if (opt) {
 815                        if (inet->cork.opt == NULL) {
 816                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 817                                if (unlikely(inet->cork.opt == NULL))
 818                                        return -ENOBUFS;
 819                        }
 820                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 821                        inet->cork.flags |= IPCORK_OPT;
 822                        inet->cork.addr = ipc->addr;
 823                }
 824                dst_hold(&rt->u.dst);
 825                inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
 826                                            rt->u.dst.dev->mtu :
 827                                            dst_mtu(rt->u.dst.path);
 828                inet->cork.rt = rt;
 829                inet->cork.length = 0;
 830                sk->sk_sndmsg_page = NULL;
 831                sk->sk_sndmsg_off = 0;
 832                if ((exthdrlen = rt->u.dst.header_len) != 0) {
 833                        length += exthdrlen;
 834                        transhdrlen += exthdrlen;
 835                }
 836        } else {
 837                rt = inet->cork.rt;
 838                if (inet->cork.flags & IPCORK_OPT)
 839                        opt = inet->cork.opt;
 840
 841                transhdrlen = 0;
 842                exthdrlen = 0;
 843                mtu = inet->cork.fragsize;
 844        }
 845        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 846
 847        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 848        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 849
 850        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 851                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 852                return -EMSGSIZE;
 853        }
 854
 855        /*
 856         * transhdrlen > 0 means that this is the first fragment and we wish
 857         * it won't be fragmented in the future.
 858         */
 859        if (transhdrlen &&
 860            length + fragheaderlen <= mtu &&
 861            rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
 862            !exthdrlen)
 863                csummode = CHECKSUM_PARTIAL;
 864
 865        inet->cork.length += length;
 866        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
 867                        (rt->u.dst.dev->features & NETIF_F_UFO)) {
 868
 869                err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 870                                         fragheaderlen, transhdrlen, mtu,
 871                                         flags);
 872                if (err)
 873                        goto error;
 874                return 0;
 875        }
 876
 877        /* So, what's going on in the loop below?
 878         *
 879         * We use calculated fragment length to generate chained skb,
 880         * each of segments is IP fragment ready for sending to network after
 881         * adding appropriate IP header.
 882         */
 883
 884        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 885                goto alloc_new_skb;
 886
 887        while (length > 0) {
 888                /* Check if the remaining data fits into current packet. */
 889                copy = mtu - skb->len;
 890                if (copy < length)
 891                        copy = maxfraglen - skb->len;
 892                if (copy <= 0) {
 893                        char *data;
 894                        unsigned int datalen;
 895                        unsigned int fraglen;
 896                        unsigned int fraggap;
 897                        unsigned int alloclen;
 898                        struct sk_buff *skb_prev;
 899alloc_new_skb:
 900                        skb_prev = skb;
 901                        if (skb_prev)
 902                                fraggap = skb_prev->len - maxfraglen;
 903                        else
 904                                fraggap = 0;
 905
 906                        /*
 907                         * If remaining data exceeds the mtu,
 908                         * we know we need more fragment(s).
 909                         */
 910                        datalen = length + fraggap;
 911                        if (datalen > mtu - fragheaderlen)
 912                                datalen = maxfraglen - fragheaderlen;
 913                        fraglen = datalen + fragheaderlen;
 914
 915                        if ((flags & MSG_MORE) &&
 916                            !(rt->u.dst.dev->features&NETIF_F_SG))
 917                                alloclen = mtu;
 918                        else
 919                                alloclen = datalen + fragheaderlen;
 920
 921                        /* The last fragment gets additional space at tail.
 922                         * Note, with MSG_MORE we overallocate on fragments,
 923                         * because we have no idea what fragment will be
 924                         * the last.
 925                         */
 926                        if (datalen == length + fraggap)
 927                                alloclen += rt->u.dst.trailer_len;
 928
 929                        if (transhdrlen) {
 930                                skb = sock_alloc_send_skb(sk,
 931                                                alloclen + hh_len + 15,
 932                                                (flags & MSG_DONTWAIT), &err);
 933                        } else {
 934                                skb = NULL;
 935                                if (atomic_read(&sk->sk_wmem_alloc) <=
 936                                    2 * sk->sk_sndbuf)
 937                                        skb = sock_wmalloc(sk,
 938                                                           alloclen + hh_len + 15, 1,
 939                                                           sk->sk_allocation);
 940                                if (unlikely(skb == NULL))
 941                                        err = -ENOBUFS;
 942                        }
 943                        if (skb == NULL)
 944                                goto error;
 945
 946                        /*
 947                         *      Fill in the control structures
 948                         */
 949                        skb->ip_summed = csummode;
 950                        skb->csum = 0;
 951                        skb_reserve(skb, hh_len);
 952
 953                        /*
 954                         *      Find where to start putting bytes.
 955                         */
 956                        data = skb_put(skb, fraglen);
 957                        skb_set_network_header(skb, exthdrlen);
 958                        skb->transport_header = (skb->network_header +
 959                                                 fragheaderlen);
 960                        data += fragheaderlen;
 961
 962                        if (fraggap) {
 963                                skb->csum = skb_copy_and_csum_bits(
 964                                        skb_prev, maxfraglen,
 965                                        data + transhdrlen, fraggap, 0);
 966                                skb_prev->csum = csum_sub(skb_prev->csum,
 967                                                          skb->csum);
 968                                data += fraggap;
 969                                pskb_trim_unique(skb_prev, maxfraglen);
 970                        }
 971
 972                        copy = datalen - transhdrlen - fraggap;
 973                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 974                                err = -EFAULT;
 975                                kfree_skb(skb);
 976                                goto error;
 977                        }
 978
 979                        offset += copy;
 980                        length -= datalen - fraggap;
 981                        transhdrlen = 0;
 982                        exthdrlen = 0;
 983                        csummode = CHECKSUM_NONE;
 984
 985                        /*
 986                         * Put the packet on the pending queue.
 987                         */
 988                        __skb_queue_tail(&sk->sk_write_queue, skb);
 989                        continue;
 990                }
 991
 992                if (copy > length)
 993                        copy = length;
 994
 995                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 996                        unsigned int off;
 997
 998                        off = skb->len;
 999                        if (getfrag(from, skb_put(skb, copy),
1000                                        offset, copy, off, skb) < 0) {
1001                                __skb_trim(skb, off);
1002                                err = -EFAULT;
1003                                goto error;
1004                        }
1005                } else {
1006                        int i = skb_shinfo(skb)->nr_frags;
1007                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1008                        struct page *page = sk->sk_sndmsg_page;
1009                        int off = sk->sk_sndmsg_off;
1010                        unsigned int left;
1011
1012                        if (page && (left = PAGE_SIZE - off) > 0) {
1013                                if (copy >= left)
1014                                        copy = left;
1015                                if (page != frag->page) {
1016                                        if (i == MAX_SKB_FRAGS) {
1017                                                err = -EMSGSIZE;
1018                                                goto error;
1019                                        }
1020                                        get_page(page);
1021                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1022                                        frag = &skb_shinfo(skb)->frags[i];
1023                                }
1024                        } else if (i < MAX_SKB_FRAGS) {
1025                                if (copy > PAGE_SIZE)
1026                                        copy = PAGE_SIZE;
1027                                page = alloc_pages(sk->sk_allocation, 0);
1028                                if (page == NULL)  {
1029                                        err = -ENOMEM;
1030                                        goto error;
1031                                }
1032                                sk->sk_sndmsg_page = page;
1033                                sk->sk_sndmsg_off = 0;
1034
1035                                skb_fill_page_desc(skb, i, page, 0, 0);
1036                                frag = &skb_shinfo(skb)->frags[i];
1037                        } else {
1038                                err = -EMSGSIZE;
1039                                goto error;
1040                        }
1041                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1042                                err = -EFAULT;
1043                                goto error;
1044                        }
1045                        sk->sk_sndmsg_off += copy;
1046                        frag->size += copy;
1047                        skb->len += copy;
1048                        skb->data_len += copy;
1049                        skb->truesize += copy;
1050                        atomic_add(copy, &sk->sk_wmem_alloc);
1051                }
1052                offset += copy;
1053                length -= copy;
1054        }
1055
1056        return 0;
1057
1058error:
1059        inet->cork.length -= length;
1060        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1061        return err;
1062}
1063
/*
 *      Append @size bytes of @page, starting at @offset, as page fragments
 *      to the datagram already pending on @sk's write queue.
 *
 *      The queue must already hold at least one skb (-EINVAL otherwise),
 *      the socket must not have hdrincl set (-EPERM) and the output device
 *      must advertise NETIF_F_SG (-EOPNOTSUPP). MSG_PROBE in @flags
 *      returns 0 without doing anything.
 *
 *      Returns 0 on success, otherwise one of the errors above or
 *      -EMSGSIZE / -ENOBUFS. Failures taken through the "error" label
 *      remove the unconsumed part of @size from inet->cork.length and bump
 *      IPSTATS_MIB_OUTDISCARDS.
 */
1064ssize_t ip_append_page(struct sock *sk, struct page *page,
1065                       int offset, size_t size, int flags)
1066{
1067        struct inet_sock *inet = inet_sk(sk);
1068        struct sk_buff *skb;
1069        struct rtable *rt;
1070        struct ip_options *opt = NULL;
1071        int hh_len;
1072        int mtu;
1073        int len;
1074        int err;
1075        unsigned int maxfraglen, fragheaderlen, fraggap;
1076
1077        if (inet->hdrincl)
1078                return -EPERM;
1079
1080        if (flags&MSG_PROBE)
1081                return 0;
1082
            /* Pages can only be added to a datagram that is already corked. */
1083        if (skb_queue_empty(&sk->sk_write_queue))
1084                return -EINVAL;
1085
1086        rt = inet->cork.rt;
1087        if (inet->cork.flags & IPCORK_OPT)
1088                opt = inet->cork.opt;
1089
1090        if (!(rt->u.dst.dev->features&NETIF_F_SG))
1091                return -EOPNOTSUPP;
1092
1093        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1094        mtu = inet->cork.fragsize;
1095
1096        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
            /* Largest fragment whose payload length is a multiple of 8 bytes. */
1097        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1098
            /* Refuse once queued data plus header would no longer fit in 0xFFFF. */
1099        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1100                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1101                return -EMSGSIZE;
1102        }
1103
            /* Second guard; the queue was already found non-empty above. */
1104        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1105                return -EINVAL;
1106
            /* Accounted up front; the error path subtracts what was not consumed. */
1107        inet->cork.length += size;
1108        if ((sk->sk_protocol == IPPROTO_UDP) &&
1109            (rt->u.dst.dev->features & NETIF_F_UFO)) {
                    /*
                     * Mark the tail skb for UDP fragmentation offload.
                     * NOTE(review): presumably this makes skb_is_gso() true
                     * in the loop below - confirm.
                     */
1110                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1111                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1112        }
1113
1114
1115        while (size > 0) {
1116                int i;
1117
                    /* A GSO skb takes everything that is left in one go. */
1118                if (skb_is_gso(skb))
1119                        len = size;
1120                else {
1121
1122                        /* Check if the remaining data fits into current packet. */
1123                        len = mtu - skb->len;
1124                        if (len < size)
1125                                len = maxfraglen - skb->len;
1126                }
1127                if (len <= 0) {
1128                        struct sk_buff *skb_prev;
1129                        int alloclen;
1130
                            /*
                             * Current skb is full: start a new one that holds
                             * only the header plus the fraggap bytes by which
                             * the previous skb exceeds maxfraglen.
                             */
1131                        skb_prev = skb;
1132                        fraggap = skb_prev->len - maxfraglen;
1133
1134                        alloclen = fragheaderlen + hh_len + fraggap + 15;
1135                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1136                        if (unlikely(!skb)) {
1137                                err = -ENOBUFS;
1138                                goto error;
1139                        }
1140
1141                        /*
1142                         *      Fill in the control structures
1143                         */
1144                        skb->ip_summed = CHECKSUM_NONE;
1145                        skb->csum = 0;
1146                        skb_reserve(skb, hh_len);
1147
1148                        /*
1149                         *      Find where to start putting bytes.
1150                         */
1151                        skb_put(skb, fragheaderlen + fraggap);
1152                        skb_reset_network_header(skb);
1153                        skb->transport_header = (skb->network_header +
1154                                                 fragheaderlen);
1155                        if (fraggap) {
                                    /*
                                     * Copy the overhanging bytes out of the
                                     * previous skb, take their checksum out of
                                     * its csum and trim it to maxfraglen.
                                     */
1156                                skb->csum = skb_copy_and_csum_bits(skb_prev,
1157                                                                   maxfraglen,
1158                                                    skb_transport_header(skb),
1159                                                                   fraggap, 0);
1160                                skb_prev->csum = csum_sub(skb_prev->csum,
1161                                                          skb->csum);
1162                                pskb_trim_unique(skb_prev, maxfraglen);
1163                        }
1164
1165                        /*
1166                         * Put the packet on the pending queue.
1167                         */
1168                        __skb_queue_tail(&sk->sk_write_queue, skb);
1169                        continue;
1170                }
1171
1172                i = skb_shinfo(skb)->nr_frags;
1173                if (len > size)
1174                        len = size;
                    /* Extend the last fragment if possible, else take a new slot. */
1175                if (skb_can_coalesce(skb, i, page, offset)) {
1176                        skb_shinfo(skb)->frags[i-1].size += len;
1177                } else if (i < MAX_SKB_FRAGS) {
1178                        get_page(page);
1179                        skb_fill_page_desc(skb, i, page, offset, len);
1180                } else {
1181                        err = -EMSGSIZE;
1182                        goto error;
1183                }
1184
                    /* Software checksum: fold the new bytes in at offset skb->len. */
1185                if (skb->ip_summed == CHECKSUM_NONE) {
1186                        __wsum csum;
1187                        csum = csum_page(page, offset, len);
1188                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
1189                }
1190
                    /* Account the new bytes in the skb and the socket. */
1191                skb->len += len;
1192                skb->data_len += len;
1193                skb->truesize += len;
1194                atomic_add(len, &sk->sk_wmem_alloc);
1195                offset += len;
1196                size -= len;
1197        }
1198        return 0;
1199
1200error:
            /* size now holds only the bytes that were not queued. */
1201        inet->cork.length -= size;
1202        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1203        return err;
1204}
1205
1206static void ip_cork_release(struct inet_sock *inet)
1207{
1208        inet->cork.flags &= ~IPCORK_OPT;
1209        kfree(inet->cork.opt);
1210        inet->cork.opt = NULL;
1211        if (inet->cork.rt) {
1212                ip_rt_put(inet->cork.rt);
1213                inet->cork.rt = NULL;
1214        }
1215}
1216
1217/*
1218 *      Combined all pending IP fragments on the socket as one IP datagram
1219 *      and push them out.
1220 */
1221int ip_push_pending_frames(struct sock *sk)
1222{
1223        struct sk_buff *skb, *tmp_skb;
1224        struct sk_buff **tail_skb;
1225        struct inet_sock *inet = inet_sk(sk);
1226        struct ip_options *opt = NULL;
1227        struct rtable *rt = inet->cork.rt;
1228        struct iphdr *iph;
1229        __be16 df = 0;
1230        __u8 ttl;
1231        int err = 0;
1232
1233        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1234                goto out;
1235        tail_skb = &(skb_shinfo(skb)->frag_list);
1236
1237        /* move skb->data to ip header from ext header */
1238        if (skb->data < skb_network_header(skb))
1239                __skb_pull(skb, skb_network_offset(skb));
1240        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1241                __skb_pull(tmp_skb, skb_network_header_len(skb));
1242                *tail_skb = tmp_skb;
1243                tail_skb = &(tmp_skb->next);
1244                skb->len += tmp_skb->len;
1245                skb->data_len += tmp_skb->len;
1246                skb->truesize += tmp_skb->truesize;
1247                __sock_put(tmp_skb->sk);
1248                tmp_skb->destructor = NULL;
1249                tmp_skb->sk = NULL;
1250        }
1251
1252        /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1253         * to fragment the frame generated here. No matter, what transforms
1254         * how transforms change size of the packet, it will come out.
1255         */
1256        if (inet->pmtudisc < IP_PMTUDISC_DO)
1257                skb->local_df = 1;
1258
1259        /* DF bit is set when we want to see DF on outgoing frames.
1260         * If local_df is set too, we still allow to fragment this frame
1261         * locally. */
1262        if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1263            (skb->len <= dst_mtu(&rt->u.dst) &&
1264             ip_dont_fragment(sk, &rt->u.dst)))
1265                df = htons(IP_DF);
1266
1267        if (inet->cork.flags & IPCORK_OPT)
1268                opt = inet->cork.opt;
1269
1270        if (rt->rt_type == RTN_MULTICAST)
1271                ttl = inet->mc_ttl;
1272        else
1273                ttl = ip_select_ttl(inet, &rt->u.dst);
1274
1275        iph = (struct iphdr *)skb->data;
1276        iph->version = 4;
1277        iph->ihl = 5;
1278        if (opt) {
1279                iph->ihl += opt->optlen>>2;
1280                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1281        }
1282        iph->tos = inet->tos;
1283        iph->frag_off = df;
1284        ip_select_ident(iph, &rt->u.dst, sk);
1285        iph->ttl = ttl;
1286        iph->protocol = sk->sk_protocol;
1287        iph->saddr = rt->rt_src;
1288        iph->daddr = rt->rt_dst;
1289
1290        skb->priority = sk->sk_priority;
1291        skb->mark = sk->sk_mark;
1292        skb->dst = dst_clone(&rt->u.dst);
1293
1294        if (iph->protocol == IPPROTO_ICMP)
1295                icmp_out_count(((struct icmphdr *)
1296                        skb_transport_header(skb))->type);
1297
1298        /* Netfilter gets whole the not fragmented skb. */
1299        err = ip_local_out(skb);
1300        if (err) {
1301                if (err > 0)
1302                        err = inet->recverr ? net_xmit_errno(err) : 0;
1303                if (err)
1304                        goto error;
1305        }
1306
1307out:
1308        ip_cork_release(inet);
1309        return err;
1310
1311error:
1312        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1313        goto out;
1314}
1315
1316/*
1317 *      Throw away all pending data on the socket.
1318 */
1319void ip_flush_pending_frames(struct sock *sk)
1320{
1321        struct sk_buff *skb;
1322
1323        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1324                kfree_skb(skb);
1325
1326        ip_cork_release(inet_sk(sk));
1327}
1328
1329
1330/*
1331 *      Fetch data from kernel space and fill in checksum if needed.
1332 */
1333static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1334                              int len, int odd, struct sk_buff *skb)
1335{
1336        __wsum csum;
1337
1338        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1339        skb->csum = csum_block_add(skb->csum, csum, odd);
1340        return 0;
1341}
1342
1343/*
1344 *      Generic function to send a packet as reply to another packet.
1345 *      Used to send TCP resets so far. ICMP should use this function too.
1346 *
1347 *      Should run single threaded per socket because it uses the sock
1348 *      structure to pass arguments.
1349 */
1350void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1351                   unsigned int len)
1352{
1353        struct inet_sock *inet = inet_sk(sk);
1354        struct {
1355                struct ip_options       opt;
1356                char                    data[40];
1357        } replyopts;
1358        struct ipcm_cookie ipc;
1359        __be32 daddr;
1360        struct rtable *rt = (struct rtable*)skb->dst;
1361
1362        if (ip_options_echo(&replyopts.opt, skb))
1363                return;
1364
1365        daddr = ipc.addr = rt->rt_src;
1366        ipc.opt = NULL;
1367
1368        if (replyopts.opt.optlen) {
1369                ipc.opt = &replyopts.opt;
1370
1371                if (ipc.opt->srr)
1372                        daddr = replyopts.opt.faddr;
1373        }
1374
1375        {
1376                struct flowi fl = { .oif = arg->bound_dev_if,
1377                                    .nl_u = { .ip4_u =
1378                                              { .daddr = daddr,
1379                                                .saddr = rt->rt_spec_dst,
1380                                                .tos = RT_TOS(ip_hdr(skb)->tos) } },
1381                                    /* Not quite clean, but right. */
1382                                    .uli_u = { .ports =
1383                                               { .sport = tcp_hdr(skb)->dest,
1384                                                 .dport = tcp_hdr(skb)->source } },
1385                                    .proto = sk->sk_protocol };
1386                security_skb_classify_flow(skb, &fl);
1387                if (ip_route_output_key(sk->sk_net, &rt, &fl))
1388                        return;
1389        }
1390
1391        /* And let IP do all the hard work.
1392
1393           This chunk is not reenterable, hence spinlock.
1394           Note that it uses the fact, that this function is called
1395           with locally disabled BH and that sk cannot be already spinlocked.
1396         */
1397        bh_lock_sock(sk);
1398        inet->tos = ip_hdr(skb)->tos;
1399        sk->sk_priority = skb->priority;
1400        sk->sk_protocol = ip_hdr(skb)->protocol;
1401        sk->sk_bound_dev_if = arg->bound_dev_if;
1402        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1403                       &ipc, rt, MSG_DONTWAIT);
1404        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1405                if (arg->csumoffset >= 0)
1406                        *((__sum16 *)skb_transport_header(skb) +
1407                          arg->csumoffset) = csum_fold(csum_add(skb->csum,
1408                                                                arg->csum));
1409                skb->ip_summed = CHECKSUM_NONE;
1410                ip_push_pending_frames(sk);
1411        }
1412
1413        bh_unlock_sock(sk);
1414
1415        ip_rt_put(rt);
1416}
1417
/*
 *      IP layer initialisation: calls ip_rt_init() and inet_initpeers(),
 *      and additionally igmp_mc_proc_init() when the kernel is built with
 *      both CONFIG_IP_MULTICAST and CONFIG_PROC_FS.
 */
1418void __init ip_init(void)
1419{
1420        ip_rt_init();
1421        inet_initpeers();
1422
1423#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1424        igmp_mc_proc_init();
1425#endif
1426}
1427
/* Symbols from this file that are exported via EXPORT_SYMBOL(). */
1428EXPORT_SYMBOL(ip_generic_getfrag);
1429EXPORT_SYMBOL(ip_queue_xmit);
1430EXPORT_SYMBOL(ip_send_check);
1431
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.