linux/net/ipv4/ip_output.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              The Internet Protocol (IP) output module.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Donald Becker, <becker@super.org>
  11 *              Alan Cox, <Alan.Cox@linux.org>
  12 *              Richard Underwood
  13 *              Stefan Becker, <stefanb@yello.ping.de>
  14 *              Jorge Cwik, <jorge@laser.satlink.net>
  15 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  16 *              Hirokazu Takahashi, <taka@valinux.co.jp>
  17 *
  18 *      See ip_input.c for original log
  19 *
  20 *      Fixes:
  21 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  22 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  23 *              Bradford Johnson:       Fix faulty handling of some frames when
  24 *                                      no route is found.
  25 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  26 *                                      (in case if packet not accepted by
  27 *                                      output firewall rules)
  28 *              Mike McLagan    :       Routing by source
  29 *              Alexey Kuznetsov:       use new route cache
  30 *              Andi Kleen:             Fix broken PMTU recovery and remove
  31 *                                      some redundant tests.
  32 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  33 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  34 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  35 *                                      for decreased register pressure on x86
   36 *                                      and more readability.
  37 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  38 *                                      silently drop skb instead of failing with -EPERM.
  39 *              Detlev Wengorz  :       Copy protocol for fragments.
  40 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  41 *                                      datagrams.
  42 *              Hirokazu Takahashi:     sendfile() on UDP works now.
  43 */
  44
  45#include <asm/uaccess.h>
  46#include <asm/system.h>
  47#include <linux/module.h>
  48#include <linux/types.h>
  49#include <linux/kernel.h>
  50#include <linux/mm.h>
  51#include <linux/string.h>
  52#include <linux/errno.h>
  53#include <linux/highmem.h>
  54
  55#include <linux/socket.h>
  56#include <linux/sockios.h>
  57#include <linux/in.h>
  58#include <linux/inet.h>
  59#include <linux/netdevice.h>
  60#include <linux/etherdevice.h>
  61#include <linux/proc_fs.h>
  62#include <linux/stat.h>
  63#include <linux/init.h>
  64
  65#include <net/snmp.h>
  66#include <net/ip.h>
  67#include <net/protocol.h>
  68#include <net/route.h>
  69#include <net/xfrm.h>
  70#include <linux/skbuff.h>
  71#include <net/sock.h>
  72#include <net/arp.h>
  73#include <net/icmp.h>
  74#include <net/checksum.h>
  75#include <net/inetpeer.h>
  76#include <linux/igmp.h>
  77#include <linux/netfilter_ipv4.h>
  78#include <linux/netfilter_bridge.h>
  79#include <linux/mroute.h>
  80#include <linux/netlink.h>
  81#include <linux/tcp.h>
  82
   83int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;  /* starts at IPDEFTTL; NOTE(review): presumably the sysctl-tunable default TTL (going by the name) -- confirm against its users */
  84
  85/* Generate a checksum for an outgoing IP datagram. */
  86__inline__ void ip_send_check(struct iphdr *iph)
  87{
  88        iph->check = 0;
  89        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  90}
  91
  92int __ip_local_out(struct sk_buff *skb)
  93{
  94        struct iphdr *iph = ip_hdr(skb);
  95
  96        iph->tot_len = htons(skb->len);
  97        ip_send_check(iph);
  98        return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
  99                       dst_output);
 100}
 101
 102int ip_local_out(struct sk_buff *skb)
 103{
 104        int err;
 105
 106        err = __ip_local_out(skb);
 107        if (likely(err == 1))
 108                err = dst_output(skb);
 109
 110        return err;
 111}
 112EXPORT_SYMBOL_GPL(ip_local_out);
 113
 114/* dev_loopback_xmit for use with netfilter. */
 115static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 116{
 117        skb_reset_mac_header(newskb);
 118        __skb_pull(newskb, skb_network_offset(newskb));
 119        newskb->pkt_type = PACKET_LOOPBACK;
 120        newskb->ip_summed = CHECKSUM_UNNECESSARY;
 121        WARN_ON(!skb_dst(newskb));
 122        netif_rx(newskb);
 123        return 0;
 124}
 125
 126static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 127{
 128        int ttl = inet->uc_ttl;
 129
 130        if (ttl < 0)
 131                ttl = dst_metric(dst, RTAX_HOPLIMIT);
 132        return ttl;
 133}
 134
 135/*
 136 *              Add an ip header to a skbuff and send it out.
 137 *
 138 */
 139int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 140                          __be32 saddr, __be32 daddr, struct ip_options *opt)
 141{
 142        struct inet_sock *inet = inet_sk(sk);
 143        struct rtable *rt = skb_rtable(skb);
 144        struct iphdr *iph;
 145
 146        /* Build the IP header. */
 147        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 148        skb_reset_network_header(skb);
 149        iph = ip_hdr(skb);
 150        iph->version  = 4;
 151        iph->ihl      = 5;
 152        iph->tos      = inet->tos;
 153        if (ip_dont_fragment(sk, &rt->u.dst))
 154                iph->frag_off = htons(IP_DF);
 155        else
 156                iph->frag_off = 0;
 157        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 158        iph->daddr    = rt->rt_dst;
 159        iph->saddr    = rt->rt_src;
 160        iph->protocol = sk->sk_protocol;
 161        ip_select_ident(iph, &rt->u.dst, sk);
 162
 163        if (opt && opt->optlen) {
 164                iph->ihl += opt->optlen>>2;
 165                ip_options_build(skb, opt, daddr, rt, 0);
 166        }
 167
 168        skb->priority = sk->sk_priority;
 169        skb->mark = sk->sk_mark;
 170
 171        /* Send it out. */
 172        return ip_local_out(skb);
 173}
 174
 175EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 176
 177static inline int ip_finish_output2(struct sk_buff *skb)
 178{
 179        struct dst_entry *dst = skb_dst(skb);
 180        struct rtable *rt = (struct rtable *)dst;
 181        struct net_device *dev = dst->dev;
 182        unsigned int hh_len = LL_RESERVED_SPACE(dev);
 183
 184        if (rt->rt_type == RTN_MULTICAST) {
 185                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
 186        } else if (rt->rt_type == RTN_BROADCAST)
 187                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
 188
 189        /* Be paranoid, rather than too clever. */
 190        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
 191                struct sk_buff *skb2;
 192
 193                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 194                if (skb2 == NULL) {
 195                        kfree_skb(skb);
 196                        return -ENOMEM;
 197                }
 198                if (skb->sk)
 199                        skb_set_owner_w(skb2, skb->sk);
 200                kfree_skb(skb);
 201                skb = skb2;
 202        }
 203
 204        if (dst->hh)
 205                return neigh_hh_output(dst->hh, skb);
 206        else if (dst->neighbour)
 207                return dst->neighbour->output(skb);
 208
 209        if (net_ratelimit())
 210                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 211        kfree_skb(skb);
 212        return -EINVAL;
 213}
 214
 215static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 216{
 217        struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 218
 219        return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
 220               skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 221}
 222
 223static int ip_finish_output(struct sk_buff *skb)
 224{
 225#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 226        /* Policy lookup after SNAT yielded a new policy */
 227        if (skb_dst(skb)->xfrm != NULL) {
 228                IPCB(skb)->flags |= IPSKB_REROUTED;
 229                return dst_output(skb);
 230        }
 231#endif
 232        if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 233                return ip_fragment(skb, ip_finish_output2);
 234        else
 235                return ip_finish_output2(skb);
 236}
 237
 238int ip_mc_output(struct sk_buff *skb)
 239{
 240        struct sock *sk = skb->sk;
 241        struct rtable *rt = skb_rtable(skb);
 242        struct net_device *dev = rt->u.dst.dev;
 243
 244        /*
 245         *      If the indicated interface is up and running, send the packet.
 246         */
 247        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 248
 249        skb->dev = dev;
 250        skb->protocol = htons(ETH_P_IP);
 251
 252        /*
 253         *      Multicasts are looped back for other local users
 254         */
 255
 256        if (rt->rt_flags&RTCF_MULTICAST) {
 257                if ((!sk || inet_sk(sk)->mc_loop)
 258#ifdef CONFIG_IP_MROUTE
 259                /* Small optimization: do not loopback not local frames,
 260                   which returned after forwarding; they will be  dropped
 261                   by ip_mr_input in any case.
 262                   Note, that local frames are looped back to be delivered
 263                   to local recipients.
 264
 265                   This check is duplicated in ip_mr_input at the moment.
 266                 */
 267                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 268#endif
 269                ) {
 270                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 271                        if (newskb)
 272                                NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
 273                                        NULL, newskb->dev,
 274                                        ip_dev_loopback_xmit);
 275                }
 276
 277                /* Multicasts with ttl 0 must not go beyond the host */
 278
 279                if (ip_hdr(skb)->ttl == 0) {
 280                        kfree_skb(skb);
 281                        return 0;
 282                }
 283        }
 284
 285        if (rt->rt_flags&RTCF_BROADCAST) {
 286                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 287                if (newskb)
 288                        NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
 289                                newskb->dev, ip_dev_loopback_xmit);
 290        }
 291
 292        return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 293                            ip_finish_output,
 294                            !(IPCB(skb)->flags & IPSKB_REROUTED));
 295}
 296
 297int ip_output(struct sk_buff *skb)
 298{
 299        struct net_device *dev = skb_dst(skb)->dev;
 300
 301        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 302
 303        skb->dev = dev;
 304        skb->protocol = htons(ETH_P_IP);
 305
 306        return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
 307                            ip_finish_output,
 308                            !(IPCB(skb)->flags & IPSKB_REROUTED));
 309}
 310
 311int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 312{
 313        struct sock *sk = skb->sk;
 314        struct inet_sock *inet = inet_sk(sk);
 315        struct ip_options *opt = inet->opt;
 316        struct rtable *rt;
 317        struct iphdr *iph;
 318
 319        /* Skip all of this if the packet is already routed,
 320         * f.e. by something like SCTP.
 321         */
 322        rt = skb_rtable(skb);
 323        if (rt != NULL)
 324                goto packet_routed;
 325
 326        /* Make sure we can route this packet. */
 327        rt = (struct rtable *)__sk_dst_check(sk, 0);
 328        if (rt == NULL) {
 329                __be32 daddr;
 330
 331                /* Use correct destination address if we have options. */
 332                daddr = inet->daddr;
 333                if(opt && opt->srr)
 334                        daddr = opt->faddr;
 335
 336                {
 337                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
 338                                            .mark = sk->sk_mark,
 339                                            .nl_u = { .ip4_u =
 340                                                      { .daddr = daddr,
 341                                                        .saddr = inet->saddr,
 342                                                        .tos = RT_CONN_FLAGS(sk) } },
 343                                            .proto = sk->sk_protocol,
 344                                            .flags = inet_sk_flowi_flags(sk),
 345                                            .uli_u = { .ports =
 346                                                       { .sport = inet->sport,
 347                                                         .dport = inet->dport } } };
 348
 349                        /* If this fails, retransmit mechanism of transport layer will
 350                         * keep trying until route appears or the connection times
 351                         * itself out.
 352                         */
 353                        security_sk_classify_flow(sk, &fl);
 354                        if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
 355                                goto no_route;
 356                }
 357                sk_setup_caps(sk, &rt->u.dst);
 358        }
 359        skb_dst_set(skb, dst_clone(&rt->u.dst));
 360
 361packet_routed:
 362        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 363                goto no_route;
 364
 365        /* OK, we know where to send it, allocate and build IP header. */
 366        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 367        skb_reset_network_header(skb);
 368        iph = ip_hdr(skb);
 369        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 370        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 371                iph->frag_off = htons(IP_DF);
 372        else
 373                iph->frag_off = 0;
 374        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 375        iph->protocol = sk->sk_protocol;
 376        iph->saddr    = rt->rt_src;
 377        iph->daddr    = rt->rt_dst;
 378        /* Transport layer set skb->h.foo itself. */
 379
 380        if (opt && opt->optlen) {
 381                iph->ihl += opt->optlen >> 2;
 382                ip_options_build(skb, opt, inet->daddr, rt, 0);
 383        }
 384
 385        ip_select_ident_more(iph, &rt->u.dst, sk,
 386                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 387
 388        skb->priority = sk->sk_priority;
 389        skb->mark = sk->sk_mark;
 390
 391        return ip_local_out(skb);
 392
 393no_route:
 394        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 395        kfree_skb(skb);
 396        return -EHOSTUNREACH;
 397}
 398
 399
 400static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 401{
 402        to->pkt_type = from->pkt_type;
 403        to->priority = from->priority;
 404        to->protocol = from->protocol;
 405        skb_dst_drop(to);
 406        skb_dst_set(to, dst_clone(skb_dst(from)));
 407        to->dev = from->dev;
 408        to->mark = from->mark;
 409
 410        /* Copy the flags to each fragment. */
 411        IPCB(to)->flags = IPCB(from)->flags;
 412
 413#ifdef CONFIG_NET_SCHED
 414        to->tc_index = from->tc_index;
 415#endif
 416        nf_copy(to, from);
 417#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 418    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 419        to->nf_trace = from->nf_trace;
 420#endif
 421#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 422        to->ipvs_property = from->ipvs_property;
 423#endif
 424        skb_copy_secmark(to, from);
 425}
 426
 427/*
 428 *      This IP datagram is too large to be sent in one piece.  Break it up into
 429 *      smaller pieces (each of size equal to IP header plus
 430 *      a block of the data of the original IP data part) that will yet fit in a
 431 *      single device frame, and queue such a frame for sending.
 432 */
 433
 434int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 435{
 436        struct iphdr *iph;
 437        int raw = 0;
 438        int ptr;
 439        struct net_device *dev;
 440        struct sk_buff *skb2;
 441        unsigned int mtu, hlen, left, len, ll_rs, pad;
 442        int offset;
 443        __be16 not_last_frag;
 444        struct rtable *rt = skb_rtable(skb);
 445        int err = 0;
 446
 447        dev = rt->u.dst.dev;
 448
 449        /*
 450         *      Point into the IP datagram header.
 451         */
 452
 453        iph = ip_hdr(skb);
 454
 455        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 456                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 457                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 458                          htonl(ip_skb_dst_mtu(skb)));
 459                kfree_skb(skb);
 460                return -EMSGSIZE;
 461        }
 462
 463        /*
 464         *      Setup starting values.
 465         */
 466
 467        hlen = iph->ihl * 4;
 468        mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 469        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 470
 471        /* When frag_list is given, use it. First, check its validity:
 472         * some transformers could create wrong frag_list or break existing
 473         * one, it is not prohibited. In this case fall back to copying.
 474         *
 475         * LATER: this step can be merged to real generation of fragments,
 476         * we can switch to copy when see the first bad fragment.
 477         */
 478        if (skb_has_frags(skb)) {
 479                struct sk_buff *frag;
 480                int first_len = skb_pagelen(skb);
 481                int truesizes = 0;
 482
 483                if (first_len - hlen > mtu ||
 484                    ((first_len - hlen) & 7) ||
 485                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 486                    skb_cloned(skb))
 487                        goto slow_path;
 488
 489                skb_walk_frags(skb, frag) {
 490                        /* Correct geometry. */
 491                        if (frag->len > mtu ||
 492                            ((frag->len & 7) && frag->next) ||
 493                            skb_headroom(frag) < hlen)
 494                            goto slow_path;
 495
 496                        /* Partially cloned skb? */
 497                        if (skb_shared(frag))
 498                                goto slow_path;
 499
 500                        BUG_ON(frag->sk);
 501                        if (skb->sk) {
 502                                frag->sk = skb->sk;
 503                                frag->destructor = sock_wfree;
 504                                truesizes += frag->truesize;
 505                        }
 506                }
 507
 508                /* Everything is OK. Generate! */
 509
 510                err = 0;
 511                offset = 0;
 512                frag = skb_shinfo(skb)->frag_list;
 513                skb_frag_list_init(skb);
 514                skb->data_len = first_len - skb_headlen(skb);
 515                skb->truesize -= truesizes;
 516                skb->len = first_len;
 517                iph->tot_len = htons(first_len);
 518                iph->frag_off = htons(IP_MF);
 519                ip_send_check(iph);
 520
 521                for (;;) {
 522                        /* Prepare header of the next frame,
 523                         * before previous one went down. */
 524                        if (frag) {
 525                                frag->ip_summed = CHECKSUM_NONE;
 526                                skb_reset_transport_header(frag);
 527                                __skb_push(frag, hlen);
 528                                skb_reset_network_header(frag);
 529                                memcpy(skb_network_header(frag), iph, hlen);
 530                                iph = ip_hdr(frag);
 531                                iph->tot_len = htons(frag->len);
 532                                ip_copy_metadata(frag, skb);
 533                                if (offset == 0)
 534                                        ip_options_fragment(frag);
 535                                offset += skb->len - hlen;
 536                                iph->frag_off = htons(offset>>3);
 537                                if (frag->next != NULL)
 538                                        iph->frag_off |= htons(IP_MF);
 539                                /* Ready, complete checksum */
 540                                ip_send_check(iph);
 541                        }
 542
 543                        err = output(skb);
 544
 545                        if (!err)
 546                                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 547                        if (err || !frag)
 548                                break;
 549
 550                        skb = frag;
 551                        frag = skb->next;
 552                        skb->next = NULL;
 553                }
 554
 555                if (err == 0) {
 556                        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 557                        return 0;
 558                }
 559
 560                while (frag) {
 561                        skb = frag->next;
 562                        kfree_skb(frag);
 563                        frag = skb;
 564                }
 565                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 566                return err;
 567        }
 568
 569slow_path:
 570        left = skb->len - hlen;         /* Space per frame */
 571        ptr = raw + hlen;               /* Where to start from */
 572
 573        /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 574         * we need to make room for the encapsulating header
 575         */
 576        pad = nf_bridge_pad(skb);
 577        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
 578        mtu -= pad;
 579
 580        /*
 581         *      Fragment the datagram.
 582         */
 583
 584        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 585        not_last_frag = iph->frag_off & htons(IP_MF);
 586
 587        /*
 588         *      Keep copying data until we run out.
 589         */
 590
 591        while (left > 0) {
 592                len = left;
 593                /* IF: it doesn't fit, use 'mtu' - the data space left */
 594                if (len > mtu)
 595                        len = mtu;
 596                /* IF: we are not sending upto and including the packet end
 597                   then align the next start on an eight byte boundary */
 598                if (len < left) {
 599                        len &= ~7;
 600                }
 601                /*
 602                 *      Allocate buffer.
 603                 */
 604
 605                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 606                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 607                        err = -ENOMEM;
 608                        goto fail;
 609                }
 610
 611                /*
 612                 *      Set up data on packet
 613                 */
 614
 615                ip_copy_metadata(skb2, skb);
 616                skb_reserve(skb2, ll_rs);
 617                skb_put(skb2, len + hlen);
 618                skb_reset_network_header(skb2);
 619                skb2->transport_header = skb2->network_header + hlen;
 620
 621                /*
 622                 *      Charge the memory for the fragment to any owner
 623                 *      it might possess
 624                 */
 625
 626                if (skb->sk)
 627                        skb_set_owner_w(skb2, skb->sk);
 628
 629                /*
 630                 *      Copy the packet header into the new buffer.
 631                 */
 632
 633                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 634
 635                /*
 636                 *      Copy a block of the IP datagram.
 637                 */
 638                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 639                        BUG();
 640                left -= len;
 641
 642                /*
 643                 *      Fill in the new header fields.
 644                 */
 645                iph = ip_hdr(skb2);
 646                iph->frag_off = htons((offset >> 3));
 647
 648                /* ANK: dirty, but effective trick. Upgrade options only if
 649                 * the segment to be fragmented was THE FIRST (otherwise,
 650                 * options are already fixed) and make it ONCE
 651                 * on the initial skb, so that all the following fragments
 652                 * will inherit fixed options.
 653                 */
 654                if (offset == 0)
 655                        ip_options_fragment(skb);
 656
 657                /*
 658                 *      Added AC : If we are fragmenting a fragment that's not the
 659                 *                 last fragment then keep MF on each bit
 660                 */
 661                if (left > 0 || not_last_frag)
 662                        iph->frag_off |= htons(IP_MF);
 663                ptr += len;
 664                offset += len;
 665
 666                /*
 667                 *      Put this fragment into the sending queue.
 668                 */
 669                iph->tot_len = htons(len + hlen);
 670
 671                ip_send_check(iph);
 672
 673                err = output(skb2);
 674                if (err)
 675                        goto fail;
 676
 677                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 678        }
 679        kfree_skb(skb);
 680        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 681        return err;
 682
 683fail:
 684        kfree_skb(skb);
 685        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 686        return err;
 687}
 688
 689EXPORT_SYMBOL(ip_fragment);
 690
 691int
 692ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 693{
 694        struct iovec *iov = from;
 695
 696        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 697                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 698                        return -EFAULT;
 699        } else {
 700                __wsum csum = 0;
 701                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 702                        return -EFAULT;
 703                skb->csum = csum_block_add(skb->csum, csum, odd);
 704        }
 705        return 0;
 706}
 707
 708static inline __wsum
 709csum_page(struct page *page, int offset, int copy)
 710{
 711        char *kaddr;
 712        __wsum csum;
 713        kaddr = kmap(page);
 714        csum = csum_partial(kaddr + offset, copy, 0);
 715        kunmap(page);
 716        return csum;
 717}
 718
 719static inline int ip_ufo_append_data(struct sock *sk,
 720                        int getfrag(void *from, char *to, int offset, int len,
 721                               int odd, struct sk_buff *skb),
 722                        void *from, int length, int hh_len, int fragheaderlen,
 723                        int transhdrlen, int mtu, unsigned int flags)
 724{
 725        struct sk_buff *skb;
 726        int err;
 727
 728        /* There is support for UDP fragmentation offload by network
 729         * device, so create one single skb packet containing complete
 730         * udp datagram
 731         */
 732        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 733                skb = sock_alloc_send_skb(sk,
 734                        hh_len + fragheaderlen + transhdrlen + 20,
 735                        (flags & MSG_DONTWAIT), &err);
 736
 737                if (skb == NULL)
 738                        return err;
 739
 740                /* reserve space for Hardware header */
 741                skb_reserve(skb, hh_len);
 742
 743                /* create space for UDP/IP header */
 744                skb_put(skb, fragheaderlen + transhdrlen);
 745
 746                /* initialize network header pointer */
 747                skb_reset_network_header(skb);
 748
 749                /* initialize protocol header pointer */
 750                skb->transport_header = skb->network_header + fragheaderlen;
 751
 752                skb->ip_summed = CHECKSUM_PARTIAL;
 753                skb->csum = 0;
 754                sk->sk_sndmsg_off = 0;
 755
 756                /* specify the length of each IP datagram fragment */
 757                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 758                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 759                __skb_queue_tail(&sk->sk_write_queue, skb);
 760        }
 761
 762        return skb_append_datato_frags(sk, skb, getfrag, from,
 763                                       (length - transhdrlen));
 764}
 765
 766/*
 767 *      ip_append_data() and ip_append_page() can make one large IP datagram
  768 *      from many pieces of data. Each piece will be held on the socket
 769 *      until ip_push_pending_frames() is called. Each piece can be a page
 770 *      or non-page data.
 771 *
 772 *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 773 *      this interface potentially.
 774 *
 775 *      LATER: length must be adjusted by pad at tail, when it is required.
 776 */
 777int ip_append_data(struct sock *sk,
 778                   int getfrag(void *from, char *to, int offset, int len,
 779                               int odd, struct sk_buff *skb),
 780                   void *from, int length, int transhdrlen,
 781                   struct ipcm_cookie *ipc, struct rtable **rtp,
 782                   unsigned int flags)
 783{
 784        struct inet_sock *inet = inet_sk(sk);
 785        struct sk_buff *skb;
 786
 787        struct ip_options *opt = NULL;
 788        int hh_len;
 789        int exthdrlen;
 790        int mtu;
 791        int copy;
 792        int err;
 793        int offset = 0;
 794        unsigned int maxfraglen, fragheaderlen;
 795        int csummode = CHECKSUM_NONE;
 796        struct rtable *rt;
 797
 798        if (flags&MSG_PROBE)
 799                return 0;
 800
 801        if (skb_queue_empty(&sk->sk_write_queue)) {
 802                /*
 803                 * setup for corking.
 804                 */
 805                opt = ipc->opt;
 806                if (opt) {
 807                        if (inet->cork.opt == NULL) {
 808                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 809                                if (unlikely(inet->cork.opt == NULL))
 810                                        return -ENOBUFS;
 811                        }
 812                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 813                        inet->cork.flags |= IPCORK_OPT;
 814                        inet->cork.addr = ipc->addr;
 815                }
 816                rt = *rtp;
 817                if (unlikely(!rt))
 818                        return -EFAULT;
 819                /*
 820                 * We steal reference to this route, caller should not release it
 821                 */
 822                *rtp = NULL;
 823                inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
 824                                            rt->u.dst.dev->mtu :
 825                                            dst_mtu(rt->u.dst.path);
 826                inet->cork.dst = &rt->u.dst;
 827                inet->cork.length = 0;
 828                sk->sk_sndmsg_page = NULL;
 829                sk->sk_sndmsg_off = 0;
 830                if ((exthdrlen = rt->u.dst.header_len) != 0) {
 831                        length += exthdrlen;
 832                        transhdrlen += exthdrlen;
 833                }
 834        } else {
 835                rt = (struct rtable *)inet->cork.dst;
 836                if (inet->cork.flags & IPCORK_OPT)
 837                        opt = inet->cork.opt;
 838
 839                transhdrlen = 0;
 840                exthdrlen = 0;
 841                mtu = inet->cork.fragsize;
 842        }
 843        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 844
 845        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 846        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 847
 848        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 849                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 850                return -EMSGSIZE;
 851        }
 852
 853        /*
 854         * transhdrlen > 0 means that this is the first fragment and we wish
 855         * it won't be fragmented in the future.
 856         */
 857        if (transhdrlen &&
 858            length + fragheaderlen <= mtu &&
 859            rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
 860            !exthdrlen)
 861                csummode = CHECKSUM_PARTIAL;
 862
 863        inet->cork.length += length;
 864        if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
 865            (sk->sk_protocol == IPPROTO_UDP) &&
 866            (rt->u.dst.dev->features & NETIF_F_UFO)) {
 867                err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 868                                         fragheaderlen, transhdrlen, mtu,
 869                                         flags);
 870                if (err)
 871                        goto error;
 872                return 0;
 873        }
 874
 875        /* So, what's going on in the loop below?
 876         *
 877         * We use calculated fragment length to generate chained skb,
 878         * each of segments is IP fragment ready for sending to network after
 879         * adding appropriate IP header.
 880         */
 881
 882        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 883                goto alloc_new_skb;
 884
 885        while (length > 0) {
 886                /* Check if the remaining data fits into current packet. */
 887                copy = mtu - skb->len;
 888                if (copy < length)
 889                        copy = maxfraglen - skb->len;
 890                if (copy <= 0) {
 891                        char *data;
 892                        unsigned int datalen;
 893                        unsigned int fraglen;
 894                        unsigned int fraggap;
 895                        unsigned int alloclen;
 896                        struct sk_buff *skb_prev;
 897alloc_new_skb:
 898                        skb_prev = skb;
 899                        if (skb_prev)
 900                                fraggap = skb_prev->len - maxfraglen;
 901                        else
 902                                fraggap = 0;
 903
 904                        /*
 905                         * If remaining data exceeds the mtu,
 906                         * we know we need more fragment(s).
 907                         */
 908                        datalen = length + fraggap;
 909                        if (datalen > mtu - fragheaderlen)
 910                                datalen = maxfraglen - fragheaderlen;
 911                        fraglen = datalen + fragheaderlen;
 912
 913                        if ((flags & MSG_MORE) &&
 914                            !(rt->u.dst.dev->features&NETIF_F_SG))
 915                                alloclen = mtu;
 916                        else
 917                                alloclen = datalen + fragheaderlen;
 918
 919                        /* The last fragment gets additional space at tail.
 920                         * Note, with MSG_MORE we overallocate on fragments,
 921                         * because we have no idea what fragment will be
 922                         * the last.
 923                         */
 924                        if (datalen == length + fraggap)
 925                                alloclen += rt->u.dst.trailer_len;
 926
 927                        if (transhdrlen) {
 928                                skb = sock_alloc_send_skb(sk,
 929                                                alloclen + hh_len + 15,
 930                                                (flags & MSG_DONTWAIT), &err);
 931                        } else {
 932                                skb = NULL;
 933                                if (atomic_read(&sk->sk_wmem_alloc) <=
 934                                    2 * sk->sk_sndbuf)
 935                                        skb = sock_wmalloc(sk,
 936                                                           alloclen + hh_len + 15, 1,
 937                                                           sk->sk_allocation);
 938                                if (unlikely(skb == NULL))
 939                                        err = -ENOBUFS;
 940                                else
 941                                        /* only the initial fragment is
 942                                           time stamped */
 943                                        ipc->shtx.flags = 0;
 944                        }
 945                        if (skb == NULL)
 946                                goto error;
 947
 948                        /*
 949                         *      Fill in the control structures
 950                         */
 951                        skb->ip_summed = csummode;
 952                        skb->csum = 0;
 953                        skb_reserve(skb, hh_len);
 954                        *skb_tx(skb) = ipc->shtx;
 955
 956                        /*
 957                         *      Find where to start putting bytes.
 958                         */
 959                        data = skb_put(skb, fraglen);
 960                        skb_set_network_header(skb, exthdrlen);
 961                        skb->transport_header = (skb->network_header +
 962                                                 fragheaderlen);
 963                        data += fragheaderlen;
 964
 965                        if (fraggap) {
 966                                skb->csum = skb_copy_and_csum_bits(
 967                                        skb_prev, maxfraglen,
 968                                        data + transhdrlen, fraggap, 0);
 969                                skb_prev->csum = csum_sub(skb_prev->csum,
 970                                                          skb->csum);
 971                                data += fraggap;
 972                                pskb_trim_unique(skb_prev, maxfraglen);
 973                        }
 974
 975                        copy = datalen - transhdrlen - fraggap;
 976                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 977                                err = -EFAULT;
 978                                kfree_skb(skb);
 979                                goto error;
 980                        }
 981
 982                        offset += copy;
 983                        length -= datalen - fraggap;
 984                        transhdrlen = 0;
 985                        exthdrlen = 0;
 986                        csummode = CHECKSUM_NONE;
 987
 988                        /*
 989                         * Put the packet on the pending queue.
 990                         */
 991                        __skb_queue_tail(&sk->sk_write_queue, skb);
 992                        continue;
 993                }
 994
 995                if (copy > length)
 996                        copy = length;
 997
 998                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 999                        unsigned int off;
1000
1001                        off = skb->len;
1002                        if (getfrag(from, skb_put(skb, copy),
1003                                        offset, copy, off, skb) < 0) {
1004                                __skb_trim(skb, off);
1005                                err = -EFAULT;
1006                                goto error;
1007                        }
1008                } else {
1009                        int i = skb_shinfo(skb)->nr_frags;
1010                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1011                        struct page *page = sk->sk_sndmsg_page;
1012                        int off = sk->sk_sndmsg_off;
1013                        unsigned int left;
1014
1015                        if (page && (left = PAGE_SIZE - off) > 0) {
1016                                if (copy >= left)
1017                                        copy = left;
1018                                if (page != frag->page) {
1019                                        if (i == MAX_SKB_FRAGS) {
1020                                                err = -EMSGSIZE;
1021                                                goto error;
1022                                        }
1023                                        get_page(page);
1024                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1025                                        frag = &skb_shinfo(skb)->frags[i];
1026                                }
1027                        } else if (i < MAX_SKB_FRAGS) {
1028                                if (copy > PAGE_SIZE)
1029                                        copy = PAGE_SIZE;
1030                                page = alloc_pages(sk->sk_allocation, 0);
1031                                if (page == NULL)  {
1032                                        err = -ENOMEM;
1033                                        goto error;
1034                                }
1035                                sk->sk_sndmsg_page = page;
1036                                sk->sk_sndmsg_off = 0;
1037
1038                                skb_fill_page_desc(skb, i, page, 0, 0);
1039                                frag = &skb_shinfo(skb)->frags[i];
1040                        } else {
1041                                err = -EMSGSIZE;
1042                                goto error;
1043                        }
1044                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1045                                err = -EFAULT;
1046                                goto error;
1047                        }
1048                        sk->sk_sndmsg_off += copy;
1049                        frag->size += copy;
1050                        skb->len += copy;
1051                        skb->data_len += copy;
1052                        skb->truesize += copy;
1053                        atomic_add(copy, &sk->sk_wmem_alloc);
1054                }
1055                offset += copy;
1056                length -= copy;
1057        }
1058
1059        return 0;
1060
1061error:
1062        inet->cork.length -= length;
1063        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1064        return err;
1065}
1066
    /*
     *      Append 'size' bytes found at 'offset' inside 'page' to the datagram
     *      already pending on sk->sk_write_queue. The page is referenced from
     *      the skb's page fragment array (get_page() + skb_fill_page_desc(),
     *      or by growing the last fragment when skb_can_coalesce() allows it).
     *
     *      Fails with -EPERM when inet->hdrincl is set, -EINVAL when nothing
     *      is queued yet, -EOPNOTSUPP when the route's device lacks
     *      NETIF_F_SG, -EMSGSIZE when the datagram would exceed 0xFFFF bytes
     *      or the skb has no free fragment slot, and -ENOBUFS when a new skb
     *      cannot be allocated. MSG_PROBE in 'flags' returns 0 at once.
     *
     *      Returns 0 on success. On the 'error' path the bytes not appended
     *      are subtracted from cork.length and IPSTATS_MIB_OUTDISCARDS is
     *      incremented.
     */
1067ssize_t ip_append_page(struct sock *sk, struct page *page,
1068                       int offset, size_t size, int flags)
1069{
1070        struct inet_sock *inet = inet_sk(sk);
1071        struct sk_buff *skb;
1072        struct rtable *rt;
1073        struct ip_options *opt = NULL;
1074        int hh_len;
1075        int mtu;
1076        int len;
1077        int err;
1078        unsigned int maxfraglen, fragheaderlen, fraggap;
1079
1080        if (inet->hdrincl)
1081                return -EPERM;
1082
1083        if (flags&MSG_PROBE)
1084                return 0;
1085
            /* Pages can only extend a datagram that already has data queued. */
1086        if (skb_queue_empty(&sk->sk_write_queue))
1087                return -EINVAL;
1088
            /* Route, options and fragment size come from the cork state. */
1089        rt = (struct rtable *)inet->cork.dst;
1090        if (inet->cork.flags & IPCORK_OPT)
1091                opt = inet->cork.opt;
1092
1093        if (!(rt->u.dst.dev->features&NETIF_F_SG))
1094                return -EOPNOTSUPP;
1095
1096        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1097        mtu = inet->cork.fragsize;
1098
            /* fragheaderlen: IP header plus options.
             * maxfraglen: largest fragment length (IP header included) whose
             * payload part is a multiple of 8 bytes. */
1099        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1100        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1101
1102        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1103                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1104                return -EMSGSIZE;
1105        }
1106
1107        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1108                return -EINVAL;
1109
1110        inet->cork.length += size;
            /* UDP on a device advertising NETIF_F_UFO: mark the tail skb as
             * GSO so the loop below puts all of 'size' into it. */
1111        if ((sk->sk_protocol == IPPROTO_UDP) &&
1112            (rt->u.dst.dev->features & NETIF_F_UFO)) {
1113                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1114                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1115        }
1116
1117
1118        while (size > 0) {
1119                int i;
1120
1121                if (skb_is_gso(skb))
1122                        len = size;
1123                else {
1124
1125                        /* Check if the remaining data fits into current packet. */
1126                        len = mtu - skb->len;
1127                        if (len < size)
1128                                len = maxfraglen - skb->len;
1129                }
1130                if (len <= 0) {
                        /* Tail skb is full: start a new skb that holds only
                         * room for the IP header plus the bytes by which the
                         * previous skb exceeds maxfraglen (fraggap). */
1131                        struct sk_buff *skb_prev;
1132                        int alloclen;
1133
1134                        skb_prev = skb;
1135                        fraggap = skb_prev->len - maxfraglen;
1136
1137                        alloclen = fragheaderlen + hh_len + fraggap + 15;
1138                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1139                        if (unlikely(!skb)) {
1140                                err = -ENOBUFS;
1141                                goto error;
1142                        }
1143
1144                        /*
1145                         *      Fill in the control structures
1146                         */
1147                        skb->ip_summed = CHECKSUM_NONE;
1148                        skb->csum = 0;
1149                        skb_reserve(skb, hh_len);
1150
1151                        /*
1152                         *      Find where to start putting bytes.
1153                         */
1154                        skb_put(skb, fragheaderlen + fraggap);
1155                        skb_reset_network_header(skb);
1156                        skb->transport_header = (skb->network_header +
1157                                                 fragheaderlen);
                        /* Move the excess tail of the previous skb here,
                         * shift its checksum contribution along with it, and
                         * trim the previous skb back to maxfraglen. */
1158                        if (fraggap) {
1159                                skb->csum = skb_copy_and_csum_bits(skb_prev,
1160                                                                   maxfraglen,
1161                                                    skb_transport_header(skb),
1162                                                                   fraggap, 0);
1163                                skb_prev->csum = csum_sub(skb_prev->csum,
1164                                                          skb->csum);
1165                                pskb_trim_unique(skb_prev, maxfraglen);
1166                        }
1167
1168                        /*
1169                         * Put the packet on the pending queue.
1170                         */
1171                        __skb_queue_tail(&sk->sk_write_queue, skb);
1172                        continue;
1173                }
1174
                /* Attach up to 'len' bytes of the page to the tail skb:
                 * grow the last fragment if it can be coalesced, otherwise
                 * take a page reference and use the next free slot. */
1175                i = skb_shinfo(skb)->nr_frags;
1176                if (len > size)
1177                        len = size;
1178                if (skb_can_coalesce(skb, i, page, offset)) {
1179                        skb_shinfo(skb)->frags[i-1].size += len;
1180                } else if (i < MAX_SKB_FRAGS) {
1181                        get_page(page);
1182                        skb_fill_page_desc(skb, i, page, offset, len);
1183                } else {
1184                        err = -EMSGSIZE;
1185                        goto error;
1186                }
1187
                /* Checksum kept in software: fold the page bytes into
                 * skb->csum at their position (the current skb->len). */
1188                if (skb->ip_summed == CHECKSUM_NONE) {
1189                        __wsum csum;
1190                        csum = csum_page(page, offset, len);
1191                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
1192                }
1193
1194                skb->len += len;
1195                skb->data_len += len;
1196                skb->truesize += len;
1197                atomic_add(len, &sk->sk_wmem_alloc);
1198                offset += len;
1199                size -= len;
1200        }
1201        return 0;
1202
1203error:
            /* 'size' now holds the bytes that were not appended. */
1204        inet->cork.length -= size;
1205        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1206        return err;
1207}
1208
1209static void ip_cork_release(struct inet_sock *inet)
1210{
1211        inet->cork.flags &= ~IPCORK_OPT;
1212        kfree(inet->cork.opt);
1213        inet->cork.opt = NULL;
1214        dst_release(inet->cork.dst);
1215        inet->cork.dst = NULL;
1216}
1217
1218/*
1219 *      Combined all pending IP fragments on the socket as one IP datagram
1220 *      and push them out.
1221 */
1222int ip_push_pending_frames(struct sock *sk)
1223{
1224        struct sk_buff *skb, *tmp_skb;
1225        struct sk_buff **tail_skb;
1226        struct inet_sock *inet = inet_sk(sk);
1227        struct net *net = sock_net(sk);
1228        struct ip_options *opt = NULL;
1229        struct rtable *rt = (struct rtable *)inet->cork.dst;
1230        struct iphdr *iph;
1231        __be16 df = 0;
1232        __u8 ttl;
1233        int err = 0;
1234
1235        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1236                goto out;
1237        tail_skb = &(skb_shinfo(skb)->frag_list);
1238
1239        /* move skb->data to ip header from ext header */
1240        if (skb->data < skb_network_header(skb))
1241                __skb_pull(skb, skb_network_offset(skb));
1242        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1243                __skb_pull(tmp_skb, skb_network_header_len(skb));
1244                *tail_skb = tmp_skb;
1245                tail_skb = &(tmp_skb->next);
1246                skb->len += tmp_skb->len;
1247                skb->data_len += tmp_skb->len;
1248                skb->truesize += tmp_skb->truesize;
1249                tmp_skb->destructor = NULL;
1250                tmp_skb->sk = NULL;
1251        }
1252
1253        /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1254         * to fragment the frame generated here. No matter, what transforms
1255         * how transforms change size of the packet, it will come out.
1256         */
1257        if (inet->pmtudisc < IP_PMTUDISC_DO)
1258                skb->local_df = 1;
1259
1260        /* DF bit is set when we want to see DF on outgoing frames.
1261         * If local_df is set too, we still allow to fragment this frame
1262         * locally. */
1263        if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1264            (skb->len <= dst_mtu(&rt->u.dst) &&
1265             ip_dont_fragment(sk, &rt->u.dst)))
1266                df = htons(IP_DF);
1267
1268        if (inet->cork.flags & IPCORK_OPT)
1269                opt = inet->cork.opt;
1270
1271        if (rt->rt_type == RTN_MULTICAST)
1272                ttl = inet->mc_ttl;
1273        else
1274                ttl = ip_select_ttl(inet, &rt->u.dst);
1275
1276        iph = (struct iphdr *)skb->data;
1277        iph->version = 4;
1278        iph->ihl = 5;
1279        if (opt) {
1280                iph->ihl += opt->optlen>>2;
1281                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1282        }
1283        iph->tos = inet->tos;
1284        iph->frag_off = df;
1285        ip_select_ident(iph, &rt->u.dst, sk);
1286        iph->ttl = ttl;
1287        iph->protocol = sk->sk_protocol;
1288        iph->saddr = rt->rt_src;
1289        iph->daddr = rt->rt_dst;
1290
1291        skb->priority = sk->sk_priority;
1292        skb->mark = sk->sk_mark;
1293        /*
1294         * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1295         * on dst refcount
1296         */
1297        inet->cork.dst = NULL;
1298        skb_dst_set(skb, &rt->u.dst);
1299
1300        if (iph->protocol == IPPROTO_ICMP)
1301                icmp_out_count(net, ((struct icmphdr *)
1302                        skb_transport_header(skb))->type);
1303
1304        /* Netfilter gets whole the not fragmented skb. */
1305        err = ip_local_out(skb);
1306        if (err) {
1307                if (err > 0)
1308                        err = net_xmit_errno(err);
1309                if (err)
1310                        goto error;
1311        }
1312
1313out:
1314        ip_cork_release(inet);
1315        return err;
1316
1317error:
1318        IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1319        goto out;
1320}
1321
1322/*
1323 *      Throw away all pending data on the socket.
1324 */
1325void ip_flush_pending_frames(struct sock *sk)
1326{
1327        struct sk_buff *skb;
1328
1329        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1330                kfree_skb(skb);
1331
1332        ip_cork_release(inet_sk(sk));
1333}
1334
1335
1336/*
1337 *      Fetch data from kernel space and fill in checksum if needed.
1338 */
1339static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1340                              int len, int odd, struct sk_buff *skb)
1341{
1342        __wsum csum;
1343
1344        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1345        skb->csum = csum_block_add(skb->csum, csum, odd);
1346        return 0;
1347}
1348
1349/*
1350 *      Generic function to send a packet as reply to another packet.
1351 *      Used to send TCP resets so far. ICMP should use this function too.
1352 *
1353 *      Should run single threaded per socket because it uses the sock
1354 *      structure to pass arguments.
1355 */
1356void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1357                   unsigned int len)
1358{
1359        struct inet_sock *inet = inet_sk(sk);
1360        struct {
1361                struct ip_options       opt;
1362                char                    data[40];
1363        } replyopts;
1364        struct ipcm_cookie ipc;
1365        __be32 daddr;
1366        struct rtable *rt = skb_rtable(skb);
1367
1368        if (ip_options_echo(&replyopts.opt, skb))
1369                return;
1370
1371        daddr = ipc.addr = rt->rt_src;
1372        ipc.opt = NULL;
1373        ipc.shtx.flags = 0;
1374
1375        if (replyopts.opt.optlen) {
1376                ipc.opt = &replyopts.opt;
1377
1378                if (ipc.opt->srr)
1379                        daddr = replyopts.opt.faddr;
1380        }
1381
1382        {
1383                struct flowi fl = { .oif = arg->bound_dev_if,
1384                                    .nl_u = { .ip4_u =
1385                                              { .daddr = daddr,
1386                                                .saddr = rt->rt_spec_dst,
1387                                                .tos = RT_TOS(ip_hdr(skb)->tos) } },
1388                                    /* Not quite clean, but right. */
1389                                    .uli_u = { .ports =
1390                                               { .sport = tcp_hdr(skb)->dest,
1391                                                 .dport = tcp_hdr(skb)->source } },
1392                                    .proto = sk->sk_protocol,
1393                                    .flags = ip_reply_arg_flowi_flags(arg) };
1394                security_skb_classify_flow(skb, &fl);
1395                if (ip_route_output_key(sock_net(sk), &rt, &fl))
1396                        return;
1397        }
1398
1399        /* And let IP do all the hard work.
1400
1401           This chunk is not reenterable, hence spinlock.
1402           Note that it uses the fact, that this function is called
1403           with locally disabled BH and that sk cannot be already spinlocked.
1404         */
1405        bh_lock_sock(sk);
1406        inet->tos = ip_hdr(skb)->tos;
1407        sk->sk_priority = skb->priority;
1408        sk->sk_protocol = ip_hdr(skb)->protocol;
1409        sk->sk_bound_dev_if = arg->bound_dev_if;
1410        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1411                       &ipc, &rt, MSG_DONTWAIT);
1412        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1413                if (arg->csumoffset >= 0)
1414                        *((__sum16 *)skb_transport_header(skb) +
1415                          arg->csumoffset) = csum_fold(csum_add(skb->csum,
1416                                                                arg->csum));
1417                skb->ip_summed = CHECKSUM_NONE;
1418                ip_push_pending_frames(sk);
1419        }
1420
1421        bh_unlock_sock(sk);
1422
1423        ip_rt_put(rt);
1424}
1425
1426void __init ip_init(void)
1427{
1428        ip_rt_init();
1429        inet_initpeers();
1430
1431#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1432        igmp_mc_proc_init();
1433#endif
1434}
1435
    /*
     * Symbols exported from this file. Their definitions lie outside this
     * part of the file.
     */
1436EXPORT_SYMBOL(ip_generic_getfrag);
1437EXPORT_SYMBOL(ip_queue_xmit);
1438EXPORT_SYMBOL(ip_send_check);
1439
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.