linux/net/ipv4/ip_output.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case the packet is not accepted by
 *                                      output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen:             Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after a year in coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

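/*
 * Editorial note: nf_hook() returns 1 when the NF_INET_LOCAL_OUT hook
 * accepts the packet without queueing or stealing it; the caller is then
 * expected to invoke the okfn (here dst_output) itself, which is exactly
 * what the "err == 1" test in ip_local_out() below does.
 */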
int __ip_local_out(struct sk_buff *skb)
{
        struct iphdr *iph = ip_hdr(skb);

        iph->tot_len = htons(skb->len);
        ip_send_check(iph);
        return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
                       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
        int err;

        err = __ip_local_out(skb);
        if (likely(err == 1))
                err = dst_output(skb);

        return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(newskb));
        netif_rx_ni(newskb);
        return 0;
}

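/*
 * Editorial note: inet->uc_ttl is negative unless the application set it
 * via the IP_TTL socket option, so a negative value means "use the
 * per-route default" (RTAX_HOPLIMIT, which in turn defaults to
 * sysctl_ip_default_ttl).
 */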
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}

/*
 *              Add an ip header to a skbuff and send it out.
 */
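/*
 * A usage note (editorial, not from the original source): the typical
 * caller in this kernel generation is TCP's SYN-ACK path, which already
 * holds a routed skb and a listening socket but no established
 * connection state, hence the explicit saddr/daddr/opt arguments.
 */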
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = skb_rtable(skb);
        struct iphdr *iph;

        /* Build the IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        ip_select_ident(iph, &rt->u.dst, sk);

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        /* Send it out. */
        return ip_local_out(skb);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct rtable *rt = (struct rtable *)dst;
        struct net_device *dev = dst->dev;
        unsigned int hh_len = LL_RESERVED_SPACE(dev);

        if (rt->rt_type == RTN_MULTICAST) {
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
        } else if (rt->rt_type == RTN_BROADCAST)
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

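        /*
         * Editorial note: dst->hh caches the resolved hardware header;
         * when present, neigh_hh_output() prepends it and transmits
         * directly.  Otherwise fall back to the neighbour layer (e.g.
         * ARP resolution) via dst->neighbour->output.
         */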
        if (dst->hh)
                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

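/*
 * Editorial note: IP_PMTUDISC_PROBE means the socket performs its own
 * path-MTU probing, so use the raw device MTU and ignore any cached
 * path MTU on the route.
 */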
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
        struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

        return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
               skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm != NULL) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(skb);
        }
#endif
        if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that returned after forwarding; ip_mr_input will drop
                   them in any case.  Note that local frames are looped
                   back to be delivered to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    &&
                    ((rt->rt_flags & RTCF_LOCAL) ||
                     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
                   ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (ip_hdr(skb)->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
                                NULL, newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
                            skb->dev, ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;

        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

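/*
 * Editorial note: the NF_HOOK_COND condition above skips POST_ROUTING
 * for packets flagged IPSKB_REROUTED, i.e. packets that were already
 * rerouted to an xfrm dst in ip_finish_output() and would otherwise
 * traverse the hook a second time.
 */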
int ip_queue_xmit(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;
        int res;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rcu_read_lock();
        rt = skb_rtable(skb);
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                __be32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->inet_daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .mark = sk->sk_mark,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->inet_saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .flags = inet_sk_flowi_flags(sk),
                                            .uli_u = { .ports =
                                                       { .sport = inet->inet_sport,
                                                         .dport = inet->inet_dport } } };

                        /* If this fails, the retransmit mechanism of the
                         * transport layer will keep trying until a route
                         * appears or the connection times out.
                         */
                        security_sk_classify_flow(sk, &fl);
                        if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->u.dst);
        }
        skb_dst_set_noref(skb, &rt->u.dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        if (ip_dont_fragment(sk, &rt->u.dst) && !skb->local_df)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        /* The transport layer has already set its own header pointers. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->u.dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        res = ip_local_out(skb);
        rcu_read_unlock();
        return res;

no_route:
        rcu_read_unlock();
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}

static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
        skb_copy_secmark(to, from);
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each one an IP header plus a block of the
 *      original data) that will still fit in a single device frame, and
 *      queue such a frame for sending.
 */
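/*
 * A worked example (editorial, not from the original source): for a
 * 4000-byte payload leaving a 1500-byte-MTU device with a 20-byte IP
 * header, the data space is 1480 bytes per fragment.  Offsets are kept
 * in 8-byte units, so the fragments carry bytes 0-1479, 1480-2959 and
 * 2960-3999, with iph->frag_off holding 0, 185 and 370 respectively
 * (IP_MF set on all but the last).
 */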

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs, pad;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = skb_rtable(skb);
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = ip_hdr(skb);

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(ip_skb_dst_mtu(skb)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Set up starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
        if (skb->nf_bridge)
                mtu -= nf_bridge_mtu_reduction(skb);
#endif
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When a frag_list is given, use it.  First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited.  In such a case fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying when we see the first bad
         * fragment.
         */
        if (skb_has_frags(skb)) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);
                int truesizes = 0;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        truesizes += frag->truesize;
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->truesize -= truesizes;
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), iph, hlen);
                                iph = ip_hdr(frag);
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (!err)
                                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = raw + hlen;               /* Where to start from */

        /* For bridged IP traffic encapsulated inside e.g. a VLAN header,
         * we need to make room for the encapsulating header.
         */
        pad = nf_bridge_pad(skb);
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
        mtu -= pad;

        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet
                   end, then align the next start on an eight-byte
                   boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb_reset_network_header(skb2);
                skb2->transport_header = skb2->network_header + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = ip_hdr(skb2);
                iph->frag_off = htons((offset >> 3));

                /* ANK: a dirty but effective trick.  Update the options
                 * only if the segment being fragmented was THE FIRST
                 * (otherwise the options are already fixed), and do it
                 * ONCE on the initial skb, so that all following
                 * fragments inherit the fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC: if we are fragmenting a fragment that is
                 *      not the last fragment, then keep MF on each
                 *      fragment.
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;

                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
        }
        kfree_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
        return err;
}

EXPORT_SYMBOL(ip_fragment);

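/*
 * Editorial note: 'from' is the user iovec handed down by sendmsg().
 * When the device will checksum the packet (CHECKSUM_PARTIAL) a plain
 * copy suffices; otherwise we checksum while copying and fold the
 * partial sum into skb->csum.
 */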
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                __wsum csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}

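/*
 * Editorial note: kmap() is needed because the page may live in highmem;
 * the checksum is computed over the mapped bytes and the mapping is
 * dropped immediately afterwards.
 */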
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        __wsum csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP fragmentation offload, so
         * create a single skb containing the complete UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);

                if (skb == NULL)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;

                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(&sk->sk_write_queue, skb);
        }

        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}

/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data.  Each piece is held on the socket until
 *      ip_push_pending_frames() is called.  Each piece can be a page or
 *      non-page data.
 *
 *      Not only UDP: other transport protocols, e.g. raw sockets, can
 *      potentially use this interface.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
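/*
 * A usage sketch (editorial; names as in this era's udp_sendmsg(), treat
 * as illustrative rather than authoritative): the transport appends the
 * payload to the corked queue, then pushes one datagram:
 *
 *      err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen,
 *                           sizeof(struct udphdr), &ipc, &rt, flags);
 *      if (err)
 *              ip_flush_pending_frames(sk);
 *      else
 *              err = udp_push_pending_frames(sk);
 */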
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable **rtp,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;
        struct rtable *rt;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * Set up for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                rt = *rtp;
                if (unlikely(!rt))
                        return -EFAULT;
                /*
                 * We steal a reference to this route; the caller should
                 * not release it.
                 */
                *rtp = NULL;
                inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                                            rt->u.dst.dev->mtu :
                                            dst_mtu(rt->u.dst.path);
                inet->cork.dst = &rt->u.dst;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = (struct rtable *)inet->cork.dst;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
                               mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we
         * wish it not to be fragmented later.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;

        skb = skb_peek_tail(&sk->sk_write_queue);

        inet->cork.length += length;
        if (((length > mtu) || (skb && skb_is_gso(skb))) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {
                err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
                                         fragheaderlen, transhdrlen, mtu,
                                         flags);
                if (err)
                        goto error;
                return 0;
        }

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chain of
         * skbs; each of these segments is an IP fragment ready for
         * sending to the network once an appropriate IP header has been
         * added.
         */

        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /* The last fragment gets additional space at tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea what fragment will be
                         * the last.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                                else
                                        /* only the initial fragment is
                                           time stamped */
                                        ipc->shtx.flags = 0;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);
                        *skb_tx(skb) = ipc->shtx;

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        data += fragheaderlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL)  {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
}

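/*
 * Editorial note: this is the sendpage()/sendfile() counterpart of
 * ip_append_data(); it requires scatter-gather support on the device
 * because the page is linked into the skb rather than copied.
 */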
ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = (struct rtable *)inet->cork.dst;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;
        if ((size + skb->len > mtu) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }

        while (size > 0) {
                int i;

                if (skb_is_gso(skb))
                        len = size;
                else {
                        /* Check if the remaining data fits into current packet. */
                        len = mtu - skb->len;
                        if (len < size)
                                len = maxfraglen - skb->len;
                }
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        int alloclen;

                        skb_prev = skb;
                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        skb_put(skb, fragheaderlen + fraggap);
                        skb_reset_network_header(skb);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(skb_prev,
                                                                   maxfraglen,
                                                    skb_transport_header(skb),
                                                                   fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        __wsum csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                skb->truesize += len;
                atomic_add(len, &sk->sk_wmem_alloc);
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
}

static void ip_cork_release(struct inet_sock *inet)
{
        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        dst_release(inet->cork.dst);
        inet->cork.dst = NULL;
}

/*
 *      Combine all pending IP fragments on the socket into one IP datagram
 *      and push them out.
 */
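/*
 * Editorial note: the queued skbs are chained onto the first skb's
 * frag_list, so the datagram leaves here as a single skb; if it exceeds
 * the path MTU, ip_fragment() can walk that frag_list on the way out
 * (its "fast path").
 */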
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = (struct rtable *)inet->cork.dst;
        struct iphdr *iph;
        __be16 df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO),
         * we allow the frame generated here to be fragmented.  No matter
         * how transforms change the size of the packet, it will come out.
         */
        if (inet->pmtudisc < IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* The DF bit is set when we want to see DF on outgoing frames.
         * If local_df is also set, we still allow this frame to be
         * fragmented locally. */
        if (inet->pmtudisc >= IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->u.dst) &&
             ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->frag_off = df;
        ip_select_ident(iph, &rt->u.dst, sk);
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
        /*
         * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
         * on dst refcount
         */
        inet->cork.dst = NULL;
        skb_dst_set(skb, &rt->u.dst);

        if (iph->protocol == IPPROTO_ICMP)
                icmp_out_count(net, ((struct icmphdr *)
                        skb_transport_header(skb))->type);

        /* Netfilter gets the whole, not yet fragmented skb. */
        err = ip_local_out(skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        goto error;
        }

out:
        ip_cork_release(inet);
        return err;

error:
        IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        ip_cork_release(inet_sk(sk));
}

/*
 *      Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        __wsum csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as reply to another packet.
 *      Used to send TCP resets so far.  ICMP should use this function too.
 *
 *      Should run single threaded per socket because it uses the sock
 *      structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        __be32 daddr;
        struct rtable *rt = skb_rtable(skb);

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;
        ipc.shtx.flags = 0;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .oif = arg->bound_dev_if,
                                    .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(ip_hdr(skb)->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = tcp_hdr(skb)->dest,
                                                 .dport = tcp_hdr(skb)->source } },
                                    .proto = sk->sk_protocol,
                                    .flags = ip_reply_arg_flowi_flags(arg) };
                security_skb_classify_flow(skb, &fl);
                if (ip_route_output_key(sock_net(sk), &rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence the spinlock.  Note that it
           relies on the fact that this function is called with BH
           disabled locally and that sk cannot already be locked.
         */
        bh_lock_sock(sk);
        inet->tos = ip_hdr(skb)->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, &rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb_transport_header(skb) +
                          arg->csumoffset) = csum_fold(csum_add(skb->csum,
                                                                arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);