linux/net/core/skbuff.c
<<
>>
Prefs
   1/*
   2 *      Routines having to do with the 'struct sk_buff' memory handlers.
   3 *
   4 *      Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
   5 *                      Florian La Roche <rzsfl@rz.uni-sb.de>
   6 *
   7 *      Fixes:
   8 *              Alan Cox        :       Fixed the worst of the load
   9 *                                      balancer bugs.
  10 *              Dave Platt      :       Interrupt stacking fix.
  11 *      Richard Kooijman        :       Timestamp fixes.
  12 *              Alan Cox        :       Changed buffer format.
  13 *              Alan Cox        :       destructor hook for AF_UNIX etc.
  14 *              Linus Torvalds  :       Better skb_clone.
  15 *              Alan Cox        :       Added skb_copy.
  16 *              Alan Cox        :       Added all the changed routines Linus
  17 *                                      only put in the headers
  18 *              Ray VanTassle   :       Fixed --skb->lock in free
  19 *              Alan Cox        :       skb_copy copy arp field
  20 *              Andi Kleen      :       slabified it.
  21 *              Robert Olsson   :       Removed skb_head_pool
  22 *
  23 *      NOTE:
  24 *              The __skb_ routines should be called with interrupts
  25 *      disabled, or you better be *real* sure that the operation is atomic
  26 *      with respect to whatever list is being frobbed (e.g. via lock_sock()
  27 *      or via disabling bottom half handlers, etc).
  28 *
  29 *      This program is free software; you can redistribute it and/or
  30 *      modify it under the terms of the GNU General Public License
  31 *      as published by the Free Software Foundation; either version
  32 *      2 of the License, or (at your option) any later version.
  33 */
  34
  35/*
  36 *      The functions in this file will not compile correctly with gcc 2.4.x
  37 */
  38
  39#include <linux/module.h>
  40#include <linux/types.h>
  41#include <linux/kernel.h>
  42#include <linux/kmemcheck.h>
  43#include <linux/mm.h>
  44#include <linux/interrupt.h>
  45#include <linux/in.h>
  46#include <linux/inet.h>
  47#include <linux/slab.h>
  48#include <linux/netdevice.h>
  49#ifdef CONFIG_NET_CLS_ACT
  50#include <net/pkt_sched.h>
  51#endif
  52#include <linux/string.h>
  53#include <linux/skbuff.h>
  54#include <linux/splice.h>
  55#include <linux/cache.h>
  56#include <linux/rtnetlink.h>
  57#include <linux/init.h>
  58#include <linux/scatterlist.h>
  59#include <linux/errqueue.h>
  60#include <linux/prefetch.h>
  61
  62#include <net/protocol.h>
  63#include <net/dst.h>
  64#include <net/sock.h>
  65#include <net/checksum.h>
  66#include <net/xfrm.h>
  67
  68#include <asm/uaccess.h>
  69#include <asm/system.h>
  70#include <trace/events/skb.h>
  71
  72#include "kmap_skb.h"
  73
  74static struct kmem_cache *skbuff_head_cache __read_mostly;
  75static struct kmem_cache *skbuff_fclone_cache __read_mostly;
  76
  77static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
  78                                  struct pipe_buffer *buf)
  79{
  80        put_page(buf->page);
  81}
  82
  83static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
  84                                struct pipe_buffer *buf)
  85{
  86        get_page(buf->page);
  87}
  88
  89static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
  90                               struct pipe_buffer *buf)
  91{
  92        return 1;
  93}
  94
  95
  96/* Pipe buffer operations for a socket. */
  97static const struct pipe_buf_operations sock_pipe_buf_ops = {
  98        .can_merge = 0,
  99        .map = generic_pipe_buf_map,
 100        .unmap = generic_pipe_buf_unmap,
 101        .confirm = generic_pipe_buf_confirm,
 102        .release = sock_pipe_buf_release,
 103        .steal = sock_pipe_buf_steal,
 104        .get = sock_pipe_buf_get,
 105};
 106
 107/*
 108 *      Keep out-of-line to prevent kernel bloat.
 109 *      __builtin_return_address is not used because it is not always
 110 *      reliable.
 111 */
 112
 113/**
 114 *      skb_over_panic  -       private function
 115 *      @skb: buffer
 116 *      @sz: size
 117 *      @here: address
 118 *
 119 *      Out of line support code for skb_put(). Not user callable.
 120 */
 121static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
 122{
 123        printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
 124                          "data:%p tail:%#lx end:%#lx dev:%s\n",
 125               here, skb->len, sz, skb->head, skb->data,
 126               (unsigned long)skb->tail, (unsigned long)skb->end,
 127               skb->dev ? skb->dev->name : "<NULL>");
 128        BUG();
 129}
 130
 131/**
 132 *      skb_under_panic -       private function
 133 *      @skb: buffer
 134 *      @sz: size
 135 *      @here: address
 136 *
 137 *      Out of line support code for skb_push(). Not user callable.
 138 */
 139
 140static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 141{
 142        printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
 143                          "data:%p tail:%#lx end:%#lx dev:%s\n",
 144               here, skb->len, sz, skb->head, skb->data,
 145               (unsigned long)skb->tail, (unsigned long)skb->end,
 146               skb->dev ? skb->dev->name : "<NULL>");
 147        BUG();
 148}
 149
 150/*      Allocate a new skbuff. We do this ourselves so we can fill in a few
 151 *      'private' fields and also do memory statistics to find all the
 152 *      [BEEP] leaks.
 153 *
 154 */
 155
 156/**
 157 *      __alloc_skb     -       allocate a network buffer
 158 *      @size: size to allocate
 159 *      @gfp_mask: allocation mask
 160 *      @fclone: allocate from fclone cache instead of head cache
 161 *              and allocate a cloned (child) skb
 162 *      @node: numa node to allocate memory on
 163 *
 164 *      Allocate a new &sk_buff. The returned buffer has no headroom and a
 165 *      tail room of size bytes. The object has a reference count of one.
 166 *      The return is the buffer. On a failure the return is %NULL.
 167 *
 168 *      Buffers may only be allocated from interrupts using a @gfp_mask of
 169 *      %GFP_ATOMIC.
 170 */
 171struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 172                            int fclone, int node)
 173{
 174        struct kmem_cache *cache;
 175        struct skb_shared_info *shinfo;
 176        struct sk_buff *skb;
 177        u8 *data;
 178
 179        cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 180
 181        /* Get the HEAD */
 182        skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
 183        if (!skb)
 184                goto out;
 185        prefetchw(skb);
 186
 187        /* We do our best to align skb_shared_info on a separate cache
 188         * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
 189         * aligned memory blocks, unless SLUB/SLAB debug is enabled.
 190         * Both skb->head and skb_shared_info are cache line aligned.
 191         */
 192        size = SKB_DATA_ALIGN(size);
 193        size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 194        data = kmalloc_node_track_caller(size, gfp_mask, node);
 195        if (!data)
 196                goto nodata;
 197        /* kmalloc(size) might give us more room than requested.
 198         * Put skb_shared_info exactly at the end of allocated zone,
 199         * to allow max possible filling before reallocation.
 200         */
 201        size = SKB_WITH_OVERHEAD(ksize(data));
 202        prefetchw(data + size);
 203
 204        /*
 205         * Only clear those fields we need to clear, not those that we will
 206         * actually initialise below. Hence, don't put any more fields after
 207         * the tail pointer in struct sk_buff!
 208         */
 209        memset(skb, 0, offsetof(struct sk_buff, tail));
 210        /* Account for allocated memory : skb + skb->head */
 211        skb->truesize = SKB_TRUESIZE(size);
 212        atomic_set(&skb->users, 1);
 213        skb->head = data;
 214        skb->data = data;
 215        skb_reset_tail_pointer(skb);
 216        skb->end = skb->tail + size;
 217#ifdef NET_SKBUFF_DATA_USES_OFFSET
 218        skb->mac_header = ~0U;
 219#endif
 220
 221        /* make sure we initialize shinfo sequentially */
 222        shinfo = skb_shinfo(skb);
 223        memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 224        atomic_set(&shinfo->dataref, 1);
 225        kmemcheck_annotate_variable(shinfo->destructor_arg);
 226
 227        if (fclone) {
 228                struct sk_buff *child = skb + 1;
 229                atomic_t *fclone_ref = (atomic_t *) (child + 1);
 230
 231                kmemcheck_annotate_bitfield(child, flags1);
 232                kmemcheck_annotate_bitfield(child, flags2);
 233                skb->fclone = SKB_FCLONE_ORIG;
 234                atomic_set(fclone_ref, 1);
 235
 236                child->fclone = SKB_FCLONE_UNAVAILABLE;
 237        }
 238out:
 239        return skb;
 240nodata:
 241        kmem_cache_free(cache, skb);
 242        skb = NULL;
 243        goto out;
 244}
 245EXPORT_SYMBOL(__alloc_skb);
 246
 247/**
 248 *      __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 249 *      @dev: network device to receive on
 250 *      @length: length to allocate
 251 *      @gfp_mask: get_free_pages mask, passed to alloc_skb
 252 *
 253 *      Allocate a new &sk_buff and assign it a usage count of one. The
 254 *      buffer has unspecified headroom built in. Users should allocate
 255 *      the headroom they think they need without accounting for the
 256 *      built in space. The built in space is used for optimisations.
 257 *
 258 *      %NULL is returned if there is no free memory.
 259 */
 260struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 261                unsigned int length, gfp_t gfp_mask)
 262{
 263        struct sk_buff *skb;
 264
 265        skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
 266        if (likely(skb)) {
 267                skb_reserve(skb, NET_SKB_PAD);
 268                skb->dev = dev;
 269        }
 270        return skb;
 271}
 272EXPORT_SYMBOL(__netdev_alloc_skb);
 273
 274void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 275                int size)
 276{
 277        skb_fill_page_desc(skb, i, page, off, size);
 278        skb->len += size;
 279        skb->data_len += size;
 280        skb->truesize += size;
 281}
 282EXPORT_SYMBOL(skb_add_rx_frag);
 283
 284/**
 285 *      dev_alloc_skb - allocate an skbuff for receiving
 286 *      @length: length to allocate
 287 *
 288 *      Allocate a new &sk_buff and assign it a usage count of one. The
 289 *      buffer has unspecified headroom built in. Users should allocate
 290 *      the headroom they think they need without accounting for the
 291 *      built in space. The built in space is used for optimisations.
 292 *
 293 *      %NULL is returned if there is no free memory. Although this function
 294 *      allocates memory it can be called from an interrupt.
 295 */
 296struct sk_buff *dev_alloc_skb(unsigned int length)
 297{
 298        /*
 299         * There is more code here than it seems:
 300         * __dev_alloc_skb is an inline
 301         */
 302        return __dev_alloc_skb(length, GFP_ATOMIC);
 303}
 304EXPORT_SYMBOL(dev_alloc_skb);
 305
 306static void skb_drop_list(struct sk_buff **listp)
 307{
 308        struct sk_buff *list = *listp;
 309
 310        *listp = NULL;
 311
 312        do {
 313                struct sk_buff *this = list;
 314                list = list->next;
 315                kfree_skb(this);
 316        } while (list);
 317}
 318
 319static inline void skb_drop_fraglist(struct sk_buff *skb)
 320{
 321        skb_drop_list(&skb_shinfo(skb)->frag_list);
 322}
 323
 324static void skb_clone_fraglist(struct sk_buff *skb)
 325{
 326        struct sk_buff *list;
 327
 328        skb_walk_frags(skb, list)
 329                skb_get(list);
 330}
 331
 332static void skb_release_data(struct sk_buff *skb)
 333{
 334        if (!skb->cloned ||
 335            !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 336                               &skb_shinfo(skb)->dataref)) {
 337                if (skb_shinfo(skb)->nr_frags) {
 338                        int i;
 339                        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 340                                skb_frag_unref(skb, i);
 341                }
 342
 343                /*
 344                 * If skb buf is from userspace, we need to notify the caller
 345                 * the lower device DMA has done;
 346                 */
 347                if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 348                        struct ubuf_info *uarg;
 349
 350                        uarg = skb_shinfo(skb)->destructor_arg;
 351                        if (uarg->callback)
 352                                uarg->callback(uarg);
 353                }
 354
 355                if (skb_has_frag_list(skb))
 356                        skb_drop_fraglist(skb);
 357
 358                kfree(skb->head);
 359        }
 360}
 361
 362/*
 363 *      Free an skbuff by memory without cleaning the state.
 364 */
 365static void kfree_skbmem(struct sk_buff *skb)
 366{
 367        struct sk_buff *other;
 368        atomic_t *fclone_ref;
 369
 370        switch (skb->fclone) {
 371        case SKB_FCLONE_UNAVAILABLE:
 372                kmem_cache_free(skbuff_head_cache, skb);
 373                break;
 374
 375        case SKB_FCLONE_ORIG:
 376                fclone_ref = (atomic_t *) (skb + 2);
 377                if (atomic_dec_and_test(fclone_ref))
 378                        kmem_cache_free(skbuff_fclone_cache, skb);
 379                break;
 380
 381        case SKB_FCLONE_CLONE:
 382                fclone_ref = (atomic_t *) (skb + 1);
 383                other = skb - 1;
 384
 385                /* The clone portion is available for
 386                 * fast-cloning again.
 387                 */
 388                skb->fclone = SKB_FCLONE_UNAVAILABLE;
 389
 390                if (atomic_dec_and_test(fclone_ref))
 391                        kmem_cache_free(skbuff_fclone_cache, other);
 392                break;
 393        }
 394}
 395
 396static void skb_release_head_state(struct sk_buff *skb)
 397{
 398        skb_dst_drop(skb);
 399#ifdef CONFIG_XFRM
 400        secpath_put(skb->sp);
 401#endif
 402        if (skb->destructor) {
 403                WARN_ON(in_irq());
 404                skb->destructor(skb);
 405        }
 406#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 407        nf_conntrack_put(skb->nfct);
 408#endif
 409#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
 410        nf_conntrack_put_reasm(skb->nfct_reasm);
 411#endif
 412#ifdef CONFIG_BRIDGE_NETFILTER
 413        nf_bridge_put(skb->nf_bridge);
 414#endif
 415/* XXX: IS this still necessary? - JHS */
 416#ifdef CONFIG_NET_SCHED
 417        skb->tc_index = 0;
 418#ifdef CONFIG_NET_CLS_ACT
 419        skb->tc_verd = 0;
 420#endif
 421#endif
 422}
 423
 424/* Free everything but the sk_buff shell. */
 425static void skb_release_all(struct sk_buff *skb)
 426{
 427        skb_release_head_state(skb);
 428        skb_release_data(skb);
 429}
 430
 431/**
 432 *      __kfree_skb - private function
 433 *      @skb: buffer
 434 *
 435 *      Free an sk_buff. Release anything attached to the buffer.
 436 *      Clean the state. This is an internal helper function. Users should
 437 *      always call kfree_skb
 438 */
 439
 440void __kfree_skb(struct sk_buff *skb)
 441{
 442        skb_release_all(skb);
 443        kfree_skbmem(skb);
 444}
 445EXPORT_SYMBOL(__kfree_skb);
 446
 447/**
 448 *      kfree_skb - free an sk_buff
 449 *      @skb: buffer to free
 450 *
 451 *      Drop a reference to the buffer and free it if the usage count has
 452 *      hit zero.
 453 */
 454void kfree_skb(struct sk_buff *skb)
 455{
 456        if (unlikely(!skb))
 457                return;
 458        if (likely(atomic_read(&skb->users) == 1))
 459                smp_rmb();
 460        else if (likely(!atomic_dec_and_test(&skb->users)))
 461                return;
 462        trace_kfree_skb(skb, __builtin_return_address(0));
 463        __kfree_skb(skb);
 464}
 465EXPORT_SYMBOL(kfree_skb);
 466
 467/**
 468 *      consume_skb - free an skbuff
 469 *      @skb: buffer to free
 470 *
 471 *      Drop a ref to the buffer and free it if the usage count has hit zero
 472 *      Functions identically to kfree_skb, but kfree_skb assumes that the frame
 473 *      is being dropped after a failure and notes that
 474 */
 475void consume_skb(struct sk_buff *skb)
 476{
 477        if (unlikely(!skb))
 478                return;
 479        if (likely(atomic_read(&skb->users) == 1))
 480                smp_rmb();
 481        else if (likely(!atomic_dec_and_test(&skb->users)))
 482                return;
 483        trace_consume_skb(skb);
 484        __kfree_skb(skb);
 485}
 486EXPORT_SYMBOL(consume_skb);
 487
 488/**
 489 *      skb_recycle - clean up an skb for reuse
 490 *      @skb: buffer
 491 *
 492 *      Recycles the skb to be reused as a receive buffer. This
 493 *      function does any necessary reference count dropping, and
 494 *      cleans up the skbuff as if it just came from __alloc_skb().
 495 */
 496void skb_recycle(struct sk_buff *skb)
 497{
 498        struct skb_shared_info *shinfo;
 499
 500        skb_release_head_state(skb);
 501
 502        shinfo = skb_shinfo(skb);
 503        memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 504        atomic_set(&shinfo->dataref, 1);
 505
 506        memset(skb, 0, offsetof(struct sk_buff, tail));
 507        skb->data = skb->head + NET_SKB_PAD;
 508        skb_reset_tail_pointer(skb);
 509}
 510EXPORT_SYMBOL(skb_recycle);
 511
 512/**
 513 *      skb_recycle_check - check if skb can be reused for receive
 514 *      @skb: buffer
 515 *      @skb_size: minimum receive buffer size
 516 *
 517 *      Checks that the skb passed in is not shared or cloned, and
 518 *      that it is linear and its head portion at least as large as
 519 *      skb_size so that it can be recycled as a receive buffer.
 520 *      If these conditions are met, this function does any necessary
 521 *      reference count dropping and cleans up the skbuff as if it
 522 *      just came from __alloc_skb().
 523 */
 524bool skb_recycle_check(struct sk_buff *skb, int skb_size)
 525{
 526        if (!skb_is_recycleable(skb, skb_size))
 527                return false;
 528
 529        skb_recycle(skb);
 530
 531        return true;
 532}
 533EXPORT_SYMBOL(skb_recycle_check);
 534
 535static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 536{
 537        new->tstamp             = old->tstamp;
 538        new->dev                = old->dev;
 539        new->transport_header   = old->transport_header;
 540        new->network_header     = old->network_header;
 541        new->mac_header         = old->mac_header;
 542        skb_dst_copy(new, old);
 543        new->rxhash             = old->rxhash;
 544        new->ooo_okay           = old->ooo_okay;
 545        new->l4_rxhash          = old->l4_rxhash;
 546#ifdef CONFIG_XFRM
 547        new->sp                 = secpath_get(old->sp);
 548#endif
 549        memcpy(new->cb, old->cb, sizeof(old->cb));
 550        new->csum               = old->csum;
 551        new->local_df           = old->local_df;
 552        new->pkt_type           = old->pkt_type;
 553        new->ip_summed          = old->ip_summed;
 554        skb_copy_queue_mapping(new, old);
 555        new->priority           = old->priority;
 556#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 557        new->ipvs_property      = old->ipvs_property;
 558#endif
 559        new->protocol           = old->protocol;
 560        new->mark               = old->mark;
 561        new->skb_iif            = old->skb_iif;
 562        __nf_copy(new, old);
 563#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 564    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 565        new->nf_trace           = old->nf_trace;
 566#endif
 567#ifdef CONFIG_NET_SCHED
 568        new->tc_index           = old->tc_index;
 569#ifdef CONFIG_NET_CLS_ACT
 570        new->tc_verd            = old->tc_verd;
 571#endif
 572#endif
 573        new->vlan_tci           = old->vlan_tci;
 574
 575        skb_copy_secmark(new, old);
 576}
 577
 578/*
 579 * You should not add any new code to this function.  Add it to
 580 * __copy_skb_header above instead.
 581 */
 582static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 583{
 584#define C(x) n->x = skb->x
 585
 586        n->next = n->prev = NULL;
 587        n->sk = NULL;
 588        __copy_skb_header(n, skb);
 589
 590        C(len);
 591        C(data_len);
 592        C(mac_len);
 593        n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
 594        n->cloned = 1;
 595        n->nohdr = 0;
 596        n->destructor = NULL;
 597        C(tail);
 598        C(end);
 599        C(head);
 600        C(data);
 601        C(truesize);
 602        atomic_set(&n->users, 1);
 603
 604        atomic_inc(&(skb_shinfo(skb)->dataref));
 605        skb->cloned = 1;
 606
 607        return n;
 608#undef C
 609}
 610
 611/**
 612 *      skb_morph       -       morph one skb into another
 613 *      @dst: the skb to receive the contents
 614 *      @src: the skb to supply the contents
 615 *
 616 *      This is identical to skb_clone except that the target skb is
 617 *      supplied by the user.
 618 *
 619 *      The target skb is returned upon exit.
 620 */
 621struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 622{
 623        skb_release_all(dst);
 624        return __skb_clone(dst, src);
 625}
 626EXPORT_SYMBOL_GPL(skb_morph);
 627
 628/*      skb_copy_ubufs  -       copy userspace skb frags buffers to kernel
 629 *      @skb: the skb to modify
 630 *      @gfp_mask: allocation priority
 631 *
 632 *      This must be called on SKBTX_DEV_ZEROCOPY skb.
 633 *      It will copy all frags into kernel and drop the reference
 634 *      to userspace pages.
 635 *
 636 *      If this function is called from an interrupt gfp_mask() must be
 637 *      %GFP_ATOMIC.
 638 *
 639 *      Returns 0 on success or a negative error code on failure
 640 *      to allocate kernel memory to copy to.
 641 */
 642int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 643{
 644        int i;
 645        int num_frags = skb_shinfo(skb)->nr_frags;
 646        struct page *page, *head = NULL;
 647        struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
 648
 649        for (i = 0; i < num_frags; i++) {
 650                u8 *vaddr;
 651                skb_frag_t *f = &skb_shinfo(skb)->frags[i];
 652
 653                page = alloc_page(GFP_ATOMIC);
 654                if (!page) {
 655                        while (head) {
 656                                struct page *next = (struct page *)head->private;
 657                                put_page(head);
 658                                head = next;
 659                        }
 660                        return -ENOMEM;
 661                }
 662                vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
 663                memcpy(page_address(page),
 664                       vaddr + f->page_offset, skb_frag_size(f));
 665                kunmap_skb_frag(vaddr);
 666                page->private = (unsigned long)head;
 667                head = page;
 668        }
 669
 670        /* skb frags release userspace buffers */
 671        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 672                skb_frag_unref(skb, i);
 673
 674        uarg->callback(uarg);
 675
 676        /* skb frags point to kernel buffers */
 677        for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
 678                __skb_fill_page_desc(skb, i-1, head, 0,
 679                                     skb_shinfo(skb)->frags[i - 1].size);
 680                head = (struct page *)head->private;
 681        }
 682
 683        skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 684        return 0;
 685}
 686
 687
 688/**
 689 *      skb_clone       -       duplicate an sk_buff
 690 *      @skb: buffer to clone
 691 *      @gfp_mask: allocation priority
 692 *
 693 *      Duplicate an &sk_buff. The new one is not owned by a socket. Both
 694 *      copies share the same packet data but not structure. The new
 695 *      buffer has a reference count of 1. If the allocation fails the
 696 *      function returns %NULL otherwise the new buffer is returned.
 697 *
 698 *      If this function is called from an interrupt gfp_mask() must be
 699 *      %GFP_ATOMIC.
 700 */
 701
 702struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 703{
 704        struct sk_buff *n;
 705
 706        if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 707                if (skb_copy_ubufs(skb, gfp_mask))
 708                        return NULL;
 709        }
 710
 711        n = skb + 1;
 712        if (skb->fclone == SKB_FCLONE_ORIG &&
 713            n->fclone == SKB_FCLONE_UNAVAILABLE) {
 714                atomic_t *fclone_ref = (atomic_t *) (n + 1);
 715                n->fclone = SKB_FCLONE_CLONE;
 716                atomic_inc(fclone_ref);
 717        } else {
 718                n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 719                if (!n)
 720                        return NULL;
 721
 722                kmemcheck_annotate_bitfield(n, flags1);
 723                kmemcheck_annotate_bitfield(n, flags2);
 724                n->fclone = SKB_FCLONE_UNAVAILABLE;
 725        }
 726
 727        return __skb_clone(n, skb);
 728}
 729EXPORT_SYMBOL(skb_clone);
 730
 731static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 732{
 733#ifndef NET_SKBUFF_DATA_USES_OFFSET
 734        /*
 735         *      Shift between the two data areas in bytes
 736         */
 737        unsigned long offset = new->data - old->data;
 738#endif
 739
 740        __copy_skb_header(new, old);
 741
 742#ifndef NET_SKBUFF_DATA_USES_OFFSET
 743        /* {transport,network,mac}_header are relative to skb->head */
 744        new->transport_header += offset;
 745        new->network_header   += offset;
 746        if (skb_mac_header_was_set(new))
 747                new->mac_header       += offset;
 748#endif
 749        skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
 750        skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
 751        skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 752}
 753
 754/**
 755 *      skb_copy        -       create private copy of an sk_buff
 756 *      @skb: buffer to copy
 757 *      @gfp_mask: allocation priority
 758 *
 759 *      Make a copy of both an &sk_buff and its data. This is used when the
 760 *      caller wishes to modify the data and needs a private copy of the
 761 *      data to alter. Returns %NULL on failure or the pointer to the buffer
 762 *      on success. The returned buffer has a reference count of 1.
 763 *
 764 *      As by-product this function converts non-linear &sk_buff to linear
 765 *      one, so that &sk_buff becomes completely private and caller is allowed
 766 *      to modify all the data of returned buffer. This means that this
 767 *      function is not recommended for use in circumstances when only
 768 *      header is going to be modified. Use pskb_copy() instead.
 769 */
 770
 771struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 772{
 773        int headerlen = skb_headroom(skb);
 774        unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
 775        struct sk_buff *n = alloc_skb(size, gfp_mask);
 776
 777        if (!n)
 778                return NULL;
 779
 780        /* Set the data pointer */
 781        skb_reserve(n, headerlen);
 782        /* Set the tail pointer and length */
 783        skb_put(n, skb->len);
 784
 785        if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
 786                BUG();
 787
 788        copy_skb_header(n, skb);
 789        return n;
 790}
 791EXPORT_SYMBOL(skb_copy);
 792
 793/**
 794 *      pskb_copy       -       create copy of an sk_buff with private head.
 795 *      @skb: buffer to copy
 796 *      @gfp_mask: allocation priority
 797 *
 798 *      Make a copy of both an &sk_buff and part of its data, located
 799 *      in header. Fragmented data remain shared. This is used when
 800 *      the caller wishes to modify only header of &sk_buff and needs
 801 *      private copy of the header to alter. Returns %NULL on failure
 802 *      or the pointer to the buffer on success.
 803 *      The returned buffer has a reference count of 1.
 804 */
 805
 806struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 807{
 808        unsigned int size = skb_end_pointer(skb) - skb->head;
 809        struct sk_buff *n = alloc_skb(size, gfp_mask);
 810
 811        if (!n)
 812                goto out;
 813
 814        /* Set the data pointer */
 815        skb_reserve(n, skb_headroom(skb));
 816        /* Set the tail pointer and length */
 817        skb_put(n, skb_headlen(skb));
 818        /* Copy the bytes */
 819        skb_copy_from_linear_data(skb, n->data, n->len);
 820
 821        n->truesize += skb->data_len;
 822        n->data_len  = skb->data_len;
 823        n->len       = skb->len;
 824
 825        if (skb_shinfo(skb)->nr_frags) {
 826                int i;
 827
 828                if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 829                        if (skb_copy_ubufs(skb, gfp_mask)) {
 830                                kfree_skb(n);
 831                                n = NULL;
 832                                goto out;
 833                        }
 834                }
 835                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 836                        skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
 837                        skb_frag_ref(skb, i);
 838                }
 839                skb_shinfo(n)->nr_frags = i;
 840        }
 841
 842        if (skb_has_frag_list(skb)) {
 843                skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
 844                skb_clone_fraglist(n);
 845        }
 846
 847        copy_skb_header(n, skb);
 848out:
 849        return n;
 850}
 851EXPORT_SYMBOL(pskb_copy);
 852
 853/**
 854 *      pskb_expand_head - reallocate header of &sk_buff
 855 *      @skb: buffer to reallocate
 856 *      @nhead: room to add at head
 857 *      @ntail: room to add at tail
 858 *      @gfp_mask: allocation priority
 859 *
 860 *      Expands (or creates identical copy, if &nhead and &ntail are zero)
 861 *      header of skb. &sk_buff itself is not changed. &sk_buff MUST have
 862 *      reference count of 1. Returns zero in the case of success or error,
 863 *      if expansion failed. In the last case, &sk_buff is not changed.
 864 *
 865 *      All the pointers pointing into skb header may change and must be
 866 *      reloaded after call to this function.
 867 */
 868
 869int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 870                     gfp_t gfp_mask)
 871{
 872        int i;
 873        u8 *data;
 874        int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
 875        long off;
 876        bool fastpath;
 877
 878        BUG_ON(nhead < 0);
 879
 880        if (skb_shared(skb))
 881                BUG();
 882
 883        size = SKB_DATA_ALIGN(size);
 884
 885        /* Check if we can avoid taking references on fragments if we own
 886         * the last reference on skb->head. (see skb_release_data())
 887         */
 888        if (!skb->cloned)
 889                fastpath = true;
 890        else {
 891                int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
 892                fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
 893        }
 894
 895        if (fastpath &&
 896            size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
 897                memmove(skb->head + size, skb_shinfo(skb),
 898                        offsetof(struct skb_shared_info,
 899                                 frags[skb_shinfo(skb)->nr_frags]));
 900                memmove(skb->head + nhead, skb->head,
 901                        skb_tail_pointer(skb) - skb->head);
 902                off = nhead;
 903                goto adjust_others;
 904        }
 905
 906        data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
 907        if (!data)
 908                goto nodata;
 909
 910        /* Copy only real data... and, alas, header. This should be
 911         * optimized for the cases when header is void.
 912         */
 913        memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
 914
 915        memcpy((struct skb_shared_info *)(data + size),
 916               skb_shinfo(skb),
 917               offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 918
 919        if (fastpath) {
 920                kfree(skb->head);
 921        } else {
 922                /* copy this zero copy skb frags */
 923                if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 924                        if (skb_copy_ubufs(skb, gfp_mask))
 925                                goto nofrags;
 926                }
 927                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 928                        skb_frag_ref(skb, i);
 929
 930                if (skb_has_frag_list(skb))
 931                        skb_clone_fraglist(skb);
 932
 933                skb_release_data(skb);
 934        }
 935        off = (data + nhead) - skb->head;
 936
 937        skb->head     = data;
 938adjust_others:
 939        skb->data    += off;
 940#ifdef NET_SKBUFF_DATA_USES_OFFSET
 941        skb->end      = size;
 942        off           = nhead;
 943#else
 944        skb->end      = skb->head + size;
 945#endif
 946        /* {transport,network,mac}_header and tail are relative to skb->head */
 947        skb->tail             += off;
 948        skb->transport_header += off;
 949        skb->network_header   += off;
 950        if (skb_mac_header_was_set(skb))
 951                skb->mac_header += off;
 952        /* Only adjust this if it actually is csum_start rather than csum */
 953        if (skb->ip_summed == CHECKSUM_PARTIAL)
 954                skb->csum_start += nhead;
 955        skb->cloned   = 0;
 956        skb->hdr_len  = 0;
 957        skb->nohdr    = 0;
 958        atomic_set(&skb_shinfo(skb)->dataref, 1);
 959        return 0;
 960
 961nofrags:
 962        kfree(data);
 963nodata:
 964        return -ENOMEM;
 965}
 966EXPORT_SYMBOL(pskb_expand_head);
 967
 968/* Make private copy of skb with writable head and some headroom */
 969
 970struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
 971{
 972        struct sk_buff *skb2;
 973        int delta = headroom - skb_headroom(skb);
 974
 975        if (delta <= 0)
 976                skb2 = pskb_copy(skb, GFP_ATOMIC);
 977        else {
 978                skb2 = skb_clone(skb, GFP_ATOMIC);
 979                if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
 980                                             GFP_ATOMIC)) {
 981                        kfree_skb(skb2);
 982                        skb2 = NULL;
 983                }
 984        }
 985        return skb2;
 986}
 987EXPORT_SYMBOL(skb_realloc_headroom);
 988
 989/**
 990 *      skb_copy_expand -       copy and expand sk_buff
 991 *      @skb: buffer to copy
 992 *      @newheadroom: new free bytes at head
 993 *      @newtailroom: new free bytes at tail
 994 *      @gfp_mask: allocation priority
 995 *
 996 *      Make a copy of both an &sk_buff and its data and while doing so
 997 *      allocate additional space.
 998 *
 999 *      This is used when the caller wishes to modify the data and needs a
1000 *      private copy of the data to alter as well as more space for new fields.
1001 *      Returns %NULL on failure or the pointer to the buffer
1002 *      on success. The returned buffer has a reference count of 1.
1003 *
1004 *      You must pass %GFP_ATOMIC as the allocation priority if this function
1005 *      is called from an interrupt.
1006 */
1007struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
1008                                int newheadroom, int newtailroom,
1009                                gfp_t gfp_mask)
1010{
1011        /*
1012         *      Allocate the copy buffer
1013         */
1014        struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
1015                                      gfp_mask);
1016        int oldheadroom = skb_headroom(skb);
1017        int head_copy_len, head_copy_off;
1018        int off;
1019
1020        if (!n)
1021                return NULL;
1022
1023        skb_reserve(n, newheadroom);
1024
1025        /* Set the tail pointer and length */
1026        skb_put(n, skb->len);
1027
1028        head_copy_len = oldheadroom;
1029        head_copy_off = 0;
1030        if (newheadroom <= head_copy_len)
1031                head_copy_len = newheadroom;
1032        else
1033                head_copy_off = newheadroom - head_copy_len;
1034
1035        /* Copy the linear header and data. */
1036        if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
1037                          skb->len + head_copy_len))
1038                BUG();
1039
1040        copy_skb_header(n, skb);
1041
1042        off                  = newheadroom - oldheadroom;
1043        if (n->ip_summed == CHECKSUM_PARTIAL)
1044                n->csum_start += off;
1045#ifdef NET_SKBUFF_DATA_USES_OFFSET
1046        n->transport_header += off;
1047        n->network_header   += off;
1048        if (skb_mac_header_was_set(skb))
1049                n->mac_header += off;
1050#endif
1051
1052        return n;
1053}
1054EXPORT_SYMBOL(skb_copy_expand);
1055
1056/**
1057 *      skb_pad                 -       zero pad the tail of an skb
1058 *      @skb: buffer to pad
1059 *      @pad: space to pad
1060 *
1061 *      Ensure that a buffer is followed by a padding area that is zero
1062 *      filled. Used by network drivers which may DMA or transfer data
1063 *      beyond the buffer end onto the wire.
1064 *
1065 *      May return error in out of memory cases. The skb is freed on error.
1066 */
1067
1068int skb_pad(struct sk_buff *skb, int pad)
1069{
1070        int err;
1071        int ntail;
1072
1073        /* If the skbuff is non linear tailroom is always zero.. */
1074        if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
1075                memset(skb->data+skb->len, 0, pad);
1076                return 0;
1077        }
1078
1079        ntail = skb->data_len + pad - (skb->end - skb->tail);
1080        if (likely(skb_cloned(skb) || ntail > 0)) {
1081                err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
1082                if (unlikely(err))
1083                        goto free_skb;
1084        }
1085
1086        /* FIXME: The use of this function with non-linear skb's really needs
1087         * to be audited.
1088         */
1089        err = skb_linearize(skb);
1090        if (unlikely(err))
1091                goto free_skb;
1092
1093        memset(skb->data + skb->len, 0, pad);
1094        return 0;
1095
1096free_skb:
1097        kfree_skb(skb);
1098        return err;
1099}
1100EXPORT_SYMBOL(skb_pad);
1101
1102/**
1103 *      skb_put - add data to a buffer
1104 *      @skb: buffer to use
1105 *      @len: amount of data to add
1106 *
1107 *      This function extends the used data area of the buffer. If this would
1108 *      exceed the total buffer size the kernel will panic. A pointer to the
1109 *      first byte of the extra data is returned.
1110 */
1111unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
1112{
1113        unsigned char *tmp = skb_tail_pointer(skb);
1114        SKB_LINEAR_ASSERT(skb);
1115        skb->tail += len;
1116        skb->len  += len;
1117        if (unlikely(skb->tail > skb->end))
1118                skb_over_panic(skb, len, __builtin_return_address(0));
1119        return tmp;
1120}
1121EXPORT_SYMBOL(skb_put);
1122
1123/**
1124 *      skb_push - add data to the start of a buffer
1125 *      @skb: buffer to use
1126 *      @len: amount of data to add
1127 *
1128 *      This function extends the used data area of the buffer at the buffer
1129 *      start. If this would exceed the total buffer headroom the kernel will
1130 *      panic. A pointer to the first byte of the extra data is returned.
1131 */
1132unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
1133{
1134        skb->data -= len;
1135        skb->len  += len;
1136        if (unlikely(skb->data<skb->head))
1137                skb_under_panic(skb, len, __builtin_return_address(0));
1138        return skb->data;
1139}
1140EXPORT_SYMBOL(skb_push);
1141
/**
 *	skb_pull - remove data from the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to remove
 *
 *	Removes data from the start of a buffer and hands the memory back
 *	to the headroom. Returns a pointer to the next data in the buffer.
 *	Once the data has been pulled, future pushes will overwrite the old
 *	data.
 *
 *	Out-of-line, exported wrapper around skb_pull_inline().
 */
unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
{
	unsigned char *data = skb_pull_inline(skb, len);

	return data;
}
EXPORT_SYMBOL(skb_pull);
1157
1158/**
1159 *      skb_trim - remove end from a buffer
1160 *      @skb: buffer to alter
1161 *      @len: new length
1162 *
1163 *      Cut the length of a buffer down by removing data from the tail. If
1164 *      the buffer is already under the length specified it is not modified.
1165 *      The skb must be linear.
1166 */
1167void skb_trim(struct sk_buff *skb, unsigned int len)
1168{
1169        if (skb->len > len)
1170                __skb_trim(skb, len);
1171}
1172EXPORT_SYMBOL(skb_trim);
1173
1174/* Trims skb to length len. It can change skb pointers.
1175 */
1176
1177int ___pskb_trim(struct sk_buff *skb, unsigned int len)
1178{
1179        struct sk_buff **fragp;
1180        struct sk_buff *frag;
1181        int offset = skb_headlen(skb);
1182        int nfrags = skb_shinfo(skb)->nr_frags;
1183        int i;
1184        int err;
1185
1186        if (skb_cloned(skb) &&
1187            unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
1188                return err;
1189
1190        i = 0;
1191        if (offset >= len)
1192                goto drop_pages;
1193
1194        for (; i < nfrags; i++) {
1195                int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
1196
1197                if (end < len) {
1198                        offset = end;
1199                        continue;
1200                }
1201
1202                skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
1203
1204drop_pages:
1205                skb_shinfo(skb)->nr_frags = i;
1206
1207                for (; i < nfrags; i++)
1208                        skb_frag_unref(skb, i);
1209
1210                if (skb_has_frag_list(skb))
1211                        skb_drop_fraglist(skb);
1212                goto done;
1213        }
1214
1215        for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
1216             fragp = &frag->next) {
1217                int end = offset + frag->len;
1218
1219                if (skb_shared(frag)) {
1220                        struct sk_buff *nfrag;
1221
1222                        nfrag = skb_clone(frag, GFP_ATOMIC);
1223                        if (unlikely(!nfrag))
1224                                return -ENOMEM;
1225
1226                        nfrag->next = frag->next;
1227                        kfree_skb(frag);
1228                        frag = nfrag;
1229                        *fragp = frag;
1230                }
1231
1232                if (end < len) {
1233                        offset = end;
1234                        continue;
1235                }
1236
1237                if (end > len &&
1238                    unlikely((err = pskb_trim(frag, len - offset))))
1239                        return err;
1240
1241                if (frag->next)
1242                        skb_drop_list(&frag->next);
1243                break;
1244        }
1245
1246done:
1247        if (len > skb_headlen(skb)) {
1248                skb->data_len -= skb->len - len;
1249                skb->len       = len;
1250        } else {
1251                skb->len       = len;
1252                skb->data_len  = 0;
1253                skb_set_tail_pointer(skb, len);
1254        }
1255
1256        return 0;
1257}
1258EXPORT_SYMBOL(___pskb_trim);
1259
1260/**
1261 *      __pskb_pull_tail - advance tail of skb header
1262 *      @skb: buffer to reallocate
1263 *      @delta: number of bytes to advance tail
1264 *
1265 *      The function makes a sense only on a fragmented &sk_buff,
1266 *      it expands header moving its tail forward and copying necessary
1267 *      data from fragmented part.
1268 *
1269 *      &sk_buff MUST have reference count of 1.
1270 *
1271 *      Returns %NULL (and &sk_buff does not change) if pull failed
1272 *      or value of new tail of skb in the case of success.
1273 *
1274 *      All the pointers pointing into skb header may change and must be
1275 *      reloaded after call to this function.
1276 */
1277
1278/* Moves tail of skb head forward, copying data from fragmented part,
1279 * when it is necessary.
1280 * 1. It may fail due to malloc failure.
1281 * 2. It may change skb pointers.
1282 *
1283 * It is pretty complicated. Luckily, it is called only in exceptional cases.
1284 */
1285unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
1286{
1287        /* If skb has not enough free space at tail, get new one
1288         * plus 128 bytes for future expansions. If we have enough
1289         * room at tail, reallocate without expansion only if skb is cloned.
1290         */
1291        int i, k, eat = (skb->tail + delta) - skb->end;
1292
1293        if (eat > 0 || skb_cloned(skb)) {
1294                if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
1295                                     GFP_ATOMIC))
1296                        return NULL;
1297        }
1298
1299        if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
1300                BUG();
1301
1302        /* Optimization: no fragments, no reasons to preestimate
1303         * size of pulled pages. Superb.
1304         */
1305        if (!skb_has_frag_list(skb))
1306                goto pull_pages;
1307
1308        /* Estimate size of pulled pages. */
1309        eat = delta;
1310        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1311                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
1312
1313                if (size >= eat)
1314                        goto pull_pages;
1315                eat -= size;
1316        }
1317
1318        /* If we need update frag list, we are in troubles.
1319         * Certainly, it possible to add an offset to skb data,
1320         * but taking into account that pulling is expected to
1321         * be very rare operation, it is worth to fight against
1322         * further bloating skb head and crucify ourselves here instead.
1323         * Pure masohism, indeed. 8)8)
1324         */
1325        if (eat) {
1326                struct sk_buff *list = skb_shinfo(skb)->frag_list;
1327                struct sk_buff *clone = NULL;
1328                struct sk_buff *insp = NULL;
1329
1330                do {
1331                        BUG_ON(!list);
1332
1333                        if (list->len <= eat) {
1334                                /* Eaten as whole. */
1335                                eat -= list->len;
1336                                list = list->next;
1337                                insp = list;
1338                        } else {
1339                                /* Eaten partially. */
1340
1341                                if (skb_shared(list)) {
1342                                        /* Sucks! We need to fork list. :-( */
1343                                        clone = skb_clone(list, GFP_ATOMIC);
1344                                        if (!clone)
1345                                                return NULL;
1346                                        insp = list->next;
1347                                        list = clone;
1348                                } else {
1349                                        /* This may be pulled without
1350                                         * problems. */
1351                                        insp = list;
1352                                }
1353                                if (!pskb_pull(list, eat)) {
1354                                        kfree_skb(clone);
1355                                        return NULL;
1356                                }
1357                                break;
1358                        }
1359                } while (eat);
1360
1361                /* Free pulled out fragments. */
1362                while ((list = skb_shinfo(skb)->frag_list) != insp) {
1363                        skb_shinfo(skb)->frag_list = list->next;
1364                        kfree_skb(list);
1365                }
1366                /* And insert new clone at head. */
1367                if (clone) {
1368                        clone->next = list;
1369                        skb_shinfo(skb)->frag_list = clone;
1370                }
1371        }
1372        /* Success! Now we may commit changes to skb data. */
1373
1374pull_pages:
1375        eat = delta;
1376        k = 0;
1377        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1378                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
1379
1380                if (size <= eat) {
1381                        skb_frag_unref(skb, i);
1382                        eat -= size;
1383                } else {
1384                        skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
1385                        if (eat) {
1386                                skb_shinfo(skb)->frags[k].page_offset += eat;
1387                                skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
1388                                eat = 0;
1389                        }
1390                        k++;
1391                }
1392        }
1393        skb_shinfo(skb)->nr_frags = k;
1394
1395        skb->tail     += delta;
1396        skb->data_len -= delta;
1397
1398        return skb_tail_pointer(skb);
1399}
1400EXPORT_SYMBOL(__pskb_pull_tail);
1401
1402/**
1403 *      skb_copy_bits - copy bits from skb to kernel buffer
1404 *      @skb: source skb
1405 *      @offset: offset in source
1406 *      @to: destination buffer
1407 *      @len: number of bytes to copy
1408 *
1409 *      Copy the specified number of bytes from the source skb to the
1410 *      destination buffer.
1411 *
1412 *      CAUTION ! :
1413 *              If its prototype is ever changed,
1414 *              check arch/{*}/net/{*}.S files,
1415 *              since it is called from BPF assembly code.
1416 */
1417int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
1418{
1419        int start = skb_headlen(skb);
1420        struct sk_buff *frag_iter;
1421        int i, copy;
1422
1423        if (offset > (int)skb->len - len)
1424                goto fault;
1425
1426        /* Copy header. */
1427        if ((copy = start - offset) > 0) {
1428                if (copy > len)
1429                        copy = len;
1430                skb_copy_from_linear_data_offset(skb, offset, to, copy);
1431                if ((len -= copy) == 0)
1432                        return 0;
1433                offset += copy;
1434                to     += copy;
1435        }
1436
1437        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1438                int end;
1439
1440                WARN_ON(start > offset + len);
1441
1442                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
1443                if ((copy = end - offset) > 0) {
1444                        u8 *vaddr;
1445
1446                        if (copy > len)
1447                                copy = len;
1448
1449                        vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
1450                        memcpy(to,
1451                               vaddr + skb_shinfo(skb)->frags[i].page_offset+
1452                               offset - start, copy);
1453                        kunmap_skb_frag(vaddr);
1454
1455                        if ((len -= copy) == 0)
1456                                return 0;
1457                        offset += copy;
1458                        to     += copy;
1459                }
1460                start = end;
1461        }
1462
1463        skb_walk_frags(skb, frag_iter) {
1464                int end;
1465
1466                WARN_ON(start > offset + len);
1467
1468                end = start + frag_iter->len;
1469                if ((copy = end - offset) > 0) {
1470                        if (copy > len)
1471                                copy = len;
1472                        if (skb_copy_bits(frag_iter, offset - start, to, copy))
1473                                goto fault;
1474                        if ((len -= copy) == 0)
1475                                return 0;
1476                        offset += copy;
1477                        to     += copy;
1478                }
1479                start = end;
1480        }
1481
1482        if (!len)
1483                return 0;
1484
1485fault:
1486        return -EFAULT;
1487}
1488EXPORT_SYMBOL(skb_copy_bits);
1489
1490/*
1491 * Callback from splice_to_pipe(), if we need to release some pages
1492 * at the end of the spd in case we error'ed out in filling the pipe.
1493 */
1494static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
1495{
1496        put_page(spd->pages[i]);
1497}
1498
1499static inline struct page *linear_to_page(struct page *page, unsigned int *len,
1500                                          unsigned int *offset,
1501                                          struct sk_buff *skb, struct sock *sk)
1502{
1503        struct page *p = sk->sk_sndmsg_page;
1504        unsigned int off;
1505
1506        if (!p) {
1507new_page:
1508                p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
1509                if (!p)
1510                        return NULL;
1511
1512                off = sk->sk_sndmsg_off = 0;
1513                /* hold one ref to this page until it's full */
1514        } else {
1515                unsigned int mlen;
1516
1517                off = sk->sk_sndmsg_off;
1518                mlen = PAGE_SIZE - off;
1519                if (mlen < 64 && mlen < *len) {
1520                        put_page(p);
1521                        goto new_page;
1522                }
1523
1524                *len = min_t(unsigned int, *len, mlen);
1525        }
1526
1527        memcpy(page_address(p) + off, page_address(page) + *offset, *len);
1528        sk->sk_sndmsg_off += *len;
1529        *offset = off;
1530        get_page(p);
1531
1532        return p;
1533}
1534
1535/*
1536 * Fill page/offset/length into spd, if it can hold more pages.
1537 */
1538static inline int spd_fill_page(struct splice_pipe_desc *spd,
1539                                struct pipe_inode_info *pipe, struct page *page,
1540                                unsigned int *len, unsigned int offset,
1541                                struct sk_buff *skb, int linear,
1542                                struct sock *sk)
1543{
1544        if (unlikely(spd->nr_pages == pipe->buffers))
1545                return 1;
1546
1547        if (linear) {
1548                page = linear_to_page(page, len, &offset, skb, sk);
1549                if (!page)
1550                        return 1;
1551        } else
1552                get_page(page);
1553
1554        spd->pages[spd->nr_pages] = page;
1555        spd->partial[spd->nr_pages].len = *len;
1556        spd->partial[spd->nr_pages].offset = offset;
1557        spd->nr_pages++;
1558
1559        return 0;
1560}
1561
1562static inline void __segment_seek(struct page **page, unsigned int *poff,
1563                                  unsigned int *plen, unsigned int off)
1564{
1565        unsigned long n;
1566
1567        *poff += off;
1568        n = *poff / PAGE_SIZE;
1569        if (n)
1570                *page = nth_page(*page, n);
1571
1572        *poff = *poff % PAGE_SIZE;
1573        *plen -= off;
1574}
1575
1576static inline int __splice_segment(struct page *page, unsigned int poff,
1577                                   unsigned int plen, unsigned int *off,
1578                                   unsigned int *len, struct sk_buff *skb,
1579                                   struct splice_pipe_desc *spd, int linear,
1580                                   struct sock *sk,
1581                                   struct pipe_inode_info *pipe)
1582{
1583        if (!*len)
1584                return 1;
1585
1586        /* skip this segment if already processed */
1587        if (*off >= plen) {
1588                *off -= plen;
1589                return 0;
1590        }
1591
1592        /* ignore any bits we already processed */
1593        if (*off) {
1594                __segment_seek(&page, &poff, &plen, *off);
1595                *off = 0;
1596        }
1597
1598        do {
1599                unsigned int flen = min(*len, plen);
1600
1601                /* the linear region may spread across several pages  */
1602                flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
1603
1604                if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
1605                        return 1;
1606
1607                __segment_seek(&page, &poff, &plen, flen);
1608                *len -= flen;
1609
1610        } while (*len && plen);
1611
1612        return 0;
1613}
1614
1615/*
1616 * Map linear and fragment data from the skb to spd. It reports failure if the
1617 * pipe is full or if we already spliced the requested length.
1618 */
1619static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
1620                             unsigned int *offset, unsigned int *len,
1621                             struct splice_pipe_desc *spd, struct sock *sk)
1622{
1623        int seg;
1624
1625        /*
1626         * map the linear part
1627         */
1628        if (__splice_segment(virt_to_page(skb->data),
1629                             (unsigned long) skb->data & (PAGE_SIZE - 1),
1630                             skb_headlen(skb),
1631                             offset, len, skb, spd, 1, sk, pipe))
1632                return 1;
1633
1634        /*
1635         * then map the fragments
1636         */
1637        for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
1638                const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
1639
1640                if (__splice_segment(skb_frag_page(f),
1641                                     f->page_offset, skb_frag_size(f),
1642                                     offset, len, skb, spd, 0, sk, pipe))
1643                        return 1;
1644        }
1645
1646        return 0;
1647}
1648
1649/*
1650 * Map data from the skb to a pipe. Should handle both the linear part,
1651 * the fragments, and the frag list. It does NOT handle frag lists within
1652 * the frag list, if such a thing exists. We'd probably need to recurse to
1653 * handle that cleanly.
1654 */
1655int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
1656                    struct pipe_inode_info *pipe, unsigned int tlen,
1657                    unsigned int flags)
1658{
1659        struct partial_page partial[PIPE_DEF_BUFFERS];
1660        struct page *pages[PIPE_DEF_BUFFERS];
1661        struct splice_pipe_desc spd = {
1662                .pages = pages,
1663                .partial = partial,
1664                .flags = flags,
1665                .ops = &sock_pipe_buf_ops,
1666                .spd_release = sock_spd_release,
1667        };
1668        struct sk_buff *frag_iter;
1669        struct sock *sk = skb->sk;
1670        int ret = 0;
1671
1672        if (splice_grow_spd(pipe, &spd))
1673                return -ENOMEM;
1674
1675        /*
1676         * __skb_splice_bits() only fails if the output has no room left,
1677         * so no point in going over the frag_list for the error case.
1678         */
1679        if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
1680                goto done;
1681        else if (!tlen)
1682                goto done;
1683
1684        /*
1685         * now see if we have a frag_list to map
1686         */
1687        skb_walk_frags(skb, frag_iter) {
1688                if (!tlen)
1689                        break;
1690                if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
1691                        break;
1692        }
1693
1694done:
1695        if (spd.nr_pages) {
1696                /*
1697                 * Drop the socket lock, otherwise we have reverse
1698                 * locking dependencies between sk_lock and i_mutex
1699                 * here as compared to sendfile(). We enter here
1700                 * with the socket lock held, and splice_to_pipe() will
1701                 * grab the pipe inode lock. For sendfile() emulation,
1702                 * we call into ->sendpage() with the i_mutex lock held
1703                 * and networking will grab the socket lock.
1704                 */
1705                release_sock(sk);
1706                ret = splice_to_pipe(pipe, &spd);
1707                lock_sock(sk);
1708        }
1709
1710        splice_shrink_spd(pipe, &spd);
1711        return ret;
1712}
1713
1714/**
1715 *      skb_store_bits - store bits from kernel buffer to skb
1716 *      @skb: destination buffer
1717 *      @offset: offset in destination
1718 *      @from: source buffer
1719 *      @len: number of bytes to copy
1720 *
1721 *      Copy the specified number of bytes from the source buffer to the
1722 *      destination skb.  This function handles all the messy bits of
1723 *      traversing fragment lists and such.
1724 */
1725
1726int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
1727{
1728        int start = skb_headlen(skb);
1729        struct sk_buff *frag_iter;
1730        int i, copy;
1731
1732        if (offset > (int)skb->len - len)
1733                goto fault;
1734
1735        if ((copy = start - offset) > 0) {
1736                if (copy > len)
1737                        copy = len;
1738                skb_copy_to_linear_data_offset(skb, offset, from, copy);
1739                if ((len -= copy) == 0)
1740                        return 0;
1741                offset += copy;
1742                from += copy;
1743        }
1744
1745        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1746                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1747                int end;
1748
1749                WARN_ON(start > offset + len);
1750
1751                end = start + skb_frag_size(frag);
1752                if ((copy = end - offset) > 0) {
1753                        u8 *vaddr;
1754
1755                        if (copy > len)
1756                                copy = len;
1757
1758                        vaddr = kmap_skb_frag(frag);
1759                        memcpy(vaddr + frag->page_offset + offset - start,
1760                               from, copy);
1761                        kunmap_skb_frag(vaddr);
1762
1763                        if ((len -= copy) == 0)
1764                                return 0;
1765                        offset += copy;
1766                        from += copy;
1767                }
1768                start = end;
1769        }
1770
1771        skb_walk_frags(skb, frag_iter) {
1772                int end;
1773
1774                WARN_ON(start > offset + len);
1775
1776                end = start + frag_iter->len;
1777                if ((copy = end - offset) > 0) {
1778                        if (copy > len)
1779                                copy = len;
1780                        if (skb_store_bits(frag_iter, offset - start,
1781                                           from, copy))
1782                                goto fault;
1783                        if ((len -= copy) == 0)
1784                                return 0;
1785                        offset += copy;
1786                        from += copy;
1787                }
1788                start = end;
1789        }
1790        if (!len)
1791                return 0;
1792
1793fault:
1794        return -EFAULT;
1795}
1796EXPORT_SYMBOL(skb_store_bits);
1797
1798/* Checksum skb data. */
1799
1800__wsum skb_checksum(const struct sk_buff *skb, int offset,
1801                          int len, __wsum csum)
1802{
1803        int start = skb_headlen(skb);
1804        int i, copy = start - offset;
1805        struct sk_buff *frag_iter;
1806        int pos = 0;
1807
1808        /* Checksum header. */
1809        if (copy > 0) {
1810                if (copy > len)
1811                        copy = len;
1812                csum = csum_partial(skb->data + offset, copy, csum);
1813                if ((len -= copy) == 0)
1814                        return csum;
1815                offset += copy;
1816                pos     = copy;
1817        }
1818
1819        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1820                int end;
1821
1822                WARN_ON(start > offset + len);
1823
1824                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
1825                if ((copy = end - offset) > 0) {
1826                        __wsum csum2;
1827                        u8 *vaddr;
1828                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1829
1830                        if (copy > len)
1831                                copy = len;
1832                        vaddr = kmap_skb_frag(frag);
1833                        csum2 = csum_partial(vaddr + frag->page_offset +
1834                                             offset - start, copy, 0);
1835                        kunmap_skb_frag(vaddr);
1836                        csum = csum_block_add(csum, csum2, pos);
1837                        if (!(len -= copy))
1838                                return csum;
1839                        offset += copy;
1840                        pos    += copy;
1841                }
1842                start = end;
1843        }
1844
1845        skb_walk_frags(skb, frag_iter) {
1846                int end;
1847
1848                WARN_ON(start > offset + len);
1849
1850                end = start + frag_iter->len;
1851                if ((copy = end - offset) > 0) {
1852                        __wsum csum2;
1853                        if (copy > len)
1854                                copy = len;
1855                        csum2 = skb_checksum(frag_iter, offset - start,
1856                                             copy, 0);
1857                        csum = csum_block_add(csum, csum2, pos);
1858                        if ((len -= copy) == 0)
1859                                return csum;
1860                        offset += copy;
1861                        pos    += copy;
1862                }
1863                start = end;
1864        }
1865        BUG_ON(len);
1866
1867        return csum;
1868}
1869EXPORT_SYMBOL(skb_checksum);
1870
1871/* Both of above in one bottle. */
1872
1873__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
1874                                    u8 *to, int len, __wsum csum)
1875{
1876        int start = skb_headlen(skb);
1877        int i, copy = start - offset;
1878        struct sk_buff *frag_iter;
1879        int pos = 0;
1880
1881        /* Copy header. */
1882        if (copy > 0) {
1883                if (copy > len)
1884                        copy = len;
1885                csum = csum_partial_copy_nocheck(skb->data + offset, to,
1886                                                 copy, csum);
1887                if ((len -= copy) == 0)
1888                        return csum;
1889                offset += copy;
1890                to     += copy;
1891                pos     = copy;
1892        }
1893
1894        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1895                int end;
1896
1897                WARN_ON(start > offset + len);
1898
1899                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
1900                if ((copy = end - offset) > 0) {
1901                        __wsum csum2;
1902                        u8 *vaddr;
1903                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1904
1905                        if (copy > len)
1906                                copy = len;
1907                        vaddr = kmap_skb_frag(frag);
1908                        csum2 = csum_partial_copy_nocheck(vaddr +
1909                                                          frag->page_offset +
1910                                                          offset - start, to,
1911                                                          copy, 0);
1912                        kunmap_skb_frag(vaddr);
1913                        csum = csum_block_add(csum, csum2, pos);
1914                        if (!(len -= copy))
1915                                return csum;
1916                        offset += copy;
1917                        to     += copy;
1918                        pos    += copy;
1919                }
1920                start = end;
1921        }
1922
1923        skb_walk_frags(skb, frag_iter) {
1924                __wsum csum2;
1925                int end;
1926
1927                WARN_ON(start > offset + len);
1928
1929                end = start + frag_iter->len;
1930                if ((copy = end - offset) > 0) {
1931                        if (copy > len)
1932                                copy = len;
1933                        csum2 = skb_copy_and_csum_bits(frag_iter,
1934                                                       offset - start,
1935                                                       to, copy, 0);
1936                        csum = csum_block_add(csum, csum2, pos);
1937                        if ((len -= copy) == 0)
1938                                return csum;
1939                        offset += copy;
1940                        to     += copy;
1941                        pos    += copy;
1942                }
1943                start = end;
1944        }
1945        BUG_ON(len);
1946        return csum;
1947}
1948EXPORT_SYMBOL(skb_copy_and_csum_bits);
1949
1950void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
1951{
1952        __wsum csum;
1953        long csstart;
1954
1955        if (skb->ip_summed == CHECKSUM_PARTIAL)
1956                csstart = skb_checksum_start_offset(skb);
1957        else
1958                csstart = skb_headlen(skb);
1959
1960        BUG_ON(csstart > skb_headlen(skb));
1961
1962        skb_copy_from_linear_data(skb, to, csstart);
1963
1964        csum = 0;
1965        if (csstart != skb->len)
1966                csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
1967                                              skb->len - csstart, 0);
1968
1969        if (skb->ip_summed == CHECKSUM_PARTIAL) {
1970                long csstuff = csstart + skb->csum_offset;
1971
1972                *((__sum16 *)(to + csstuff)) = csum_fold(csum);
1973        }
1974}
1975EXPORT_SYMBOL(skb_copy_and_csum_dev);
1976
1977/**
1978 *      skb_dequeue - remove from the head of the queue
1979 *      @list: list to dequeue from
1980 *
1981 *      Remove the head of the list. The list lock is taken so the function
1982 *      may be used safely with other locking list functions. The head item is
1983 *      returned or %NULL if the list is empty.
1984 */
1985
1986struct sk_buff *skb_dequeue(struct sk_buff_head *list)
1987{
1988        unsigned long flags;
1989        struct sk_buff *result;
1990
1991        spin_lock_irqsave(&list->lock, flags);
1992        result = __skb_dequeue(list);
1993        spin_unlock_irqrestore(&list->lock, flags);
1994        return result;
1995}
1996EXPORT_SYMBOL(skb_dequeue);
1997
1998/**
1999 *      skb_dequeue_tail - remove from the tail of the queue
2000 *      @list: list to dequeue from
2001 *
2002 *      Remove the tail of the list. The list lock is taken so the function
2003 *      may be used safely with other locking list functions. The tail item is
2004 *      returned or %NULL if the list is empty.
2005 */
2006struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
2007{
2008        unsigned long flags;
2009        struct sk_buff *result;
2010
2011        spin_lock_irqsave(&list->lock, flags);
2012        result = __skb_dequeue_tail(list);
2013        spin_unlock_irqrestore(&list->lock, flags);
2014        return result;
2015}
2016EXPORT_SYMBOL(skb_dequeue_tail);
2017
2018/**
2019 *      skb_queue_purge - empty a list
2020 *      @list: list to empty
2021 *
2022 *      Delete all buffers on an &sk_buff list. Each buffer is removed from
2023 *      the list and one reference dropped. This function takes the list
2024 *      lock and is atomic with respect to other list locking functions.
2025 */
2026void skb_queue_purge(struct sk_buff_head *list)
2027{
2028        struct sk_buff *skb;
2029        while ((skb = skb_dequeue(list)) != NULL)
2030                kfree_skb(skb);
2031}
2032EXPORT_SYMBOL(skb_queue_purge);
2033
2034/**
2035 *      skb_queue_head - queue a buffer at the list head
2036 *      @list: list to use
2037 *      @newsk: buffer to queue
2038 *
2039 *      Queue a buffer at the start of the list. This function takes the
2040 *      list lock and can be used safely with other locking &sk_buff functions
2041 *      safely.
2042 *
2043 *      A buffer cannot be placed on two lists at the same time.
2044 */
2045void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
2046{
2047        unsigned long flags;
2048
2049        spin_lock_irqsave(&list->lock, flags);
2050        __skb_queue_head(list, newsk);
2051        spin_unlock_irqrestore(&list->lock, flags);
2052}
2053EXPORT_SYMBOL(skb_queue_head);
2054
2055/**
2056 *      skb_queue_tail - queue a buffer at the list tail
2057 *      @list: list to use
2058 *      @newsk: buffer to queue
2059 *
2060 *      Queue a buffer at the tail of the list. This function takes the
2061 *      list lock and can be used safely with other locking &sk_buff functions
2062 *      safely.
2063 *
2064 *      A buffer cannot be placed on two lists at the same time.
2065 */
2066void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
2067{
2068        unsigned long flags;
2069
2070        spin_lock_irqsave(&list->lock, flags);
2071        __skb_queue_tail(list, newsk);
2072        spin_unlock_irqrestore(&list->lock, flags);
2073}
2074EXPORT_SYMBOL(skb_queue_tail);
2075
2076/**
2077 *      skb_unlink      -       remove a buffer from a list
2078 *      @skb: buffer to remove
2079 *      @list: list to use
2080 *
2081 *      Remove a packet from a list. The list locks are taken and this
2082 *      function is atomic with respect to other list locked calls
2083 *
2084 *      You must know what list the SKB is on.
2085 */
2086void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
2087{
2088        unsigned long flags;
2089
2090        spin_lock_irqsave(&list->lock, flags);
2091        __skb_unlink(skb, list);
2092        spin_unlock_irqrestore(&list->lock, flags);
2093}
2094EXPORT_SYMBOL(skb_unlink);
2095
2096/**
2097 *      skb_append      -       append a buffer
2098 *      @old: buffer to insert after
2099 *      @newsk: buffer to insert
2100 *      @list: list to use
2101 *
2102 *      Place a packet after a given packet in a list. The list locks are taken
2103 *      and this function is atomic with respect to other list locked calls.
2104 *      A buffer cannot be placed on two lists at the same time.
2105 */
2106void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
2107{
2108        unsigned long flags;
2109
2110        spin_lock_irqsave(&list->lock, flags);
2111        __skb_queue_after(list, old, newsk);
2112        spin_unlock_irqrestore(&list->lock, flags);
2113}
2114EXPORT_SYMBOL(skb_append);
2115
2116/**
2117 *      skb_insert      -       insert a buffer
2118 *      @old: buffer to insert before
2119 *      @newsk: buffer to insert
2120 *      @list: list to use
2121 *
2122 *      Place a packet before a given packet in a list. The list locks are
2123 *      taken and this function is atomic with respect to other list locked
2124 *      calls.
2125 *
2126 *      A buffer cannot be placed on two lists at the same time.
2127 */
2128void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
2129{
2130        unsigned long flags;
2131
2132        spin_lock_irqsave(&list->lock, flags);
2133        __skb_insert(newsk, old->prev, old, list);
2134        spin_unlock_irqrestore(&list->lock, flags);
2135}
2136EXPORT_SYMBOL(skb_insert);
2137
2138static inline void skb_split_inside_header(struct sk_buff *skb,
2139                                           struct sk_buff* skb1,
2140                                           const u32 len, const int pos)
2141{
2142        int i;
2143
2144        skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
2145                                         pos - len);
2146        /* And move data appendix as is. */
2147        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
2148                skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
2149
2150        skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
2151        skb_shinfo(skb)->nr_frags  = 0;
2152        skb1->data_len             = skb->data_len;
2153        skb1->len                  += skb1->data_len;
2154        skb->data_len              = 0;
2155        skb->len                   = len;
2156        skb_set_tail_pointer(skb, len);
2157}
2158
2159static inline void skb_split_no_header(struct sk_buff *skb,
2160                                       struct sk_buff* skb1,
2161                                       const u32 len, int pos)
2162{
2163        int i, k = 0;
2164        const int nfrags = skb_shinfo(skb)->nr_frags;
2165
2166        skb_shinfo(skb)->nr_frags = 0;
2167        skb1->len                 = skb1->data_len = skb->len - len;
2168        skb->len                  = len;
2169        skb->data_len             = len - pos;
2170
2171        for (i = 0; i < nfrags; i++) {
2172                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2173
2174                if (pos + size > len) {
2175                        skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
2176
2177                        if (pos < len) {
2178                                /* Split frag.
2179                                 * We have two variants in this case:
2180                                 * 1. Move all the frag to the second
2181                                 *    part, if it is possible. F.e.
2182                                 *    this approach is mandatory for TUX,
2183                                 *    where splitting is expensive.
2184                                 * 2. Split is accurately. We make this.
2185                                 */
2186                                skb_frag_ref(skb, i);
2187                                skb_shinfo(skb1)->frags[0].page_offset += len - pos;
2188                                skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
2189                                skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
2190                                skb_shinfo(skb)->nr_frags++;
2191                        }
2192                        k++;
2193                } else
2194                        skb_shinfo(skb)->nr_frags++;
2195                pos += size;
2196        }
2197        skb_shinfo(skb1)->nr_frags = k;
2198}
2199
2200/**
2201 * skb_split - Split fragmented skb to two parts at length len.
2202 * @skb: the buffer to split
2203 * @skb1: the buffer to receive the second part
2204 * @len: new length for skb
2205 */
2206void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
2207{
2208        int pos = skb_headlen(skb);
2209
2210        if (len < pos)  /* Split line is inside header. */
2211                skb_split_inside_header(skb, skb1, len, pos);
2212        else            /* Second chunk has no header, nothing to copy. */
2213                skb_split_no_header(skb, skb1, len, pos);
2214}
2215EXPORT_SYMBOL(skb_split);
2216
2217/* Shifting from/to a cloned skb is a no-go.
2218 *
2219 * Caller cannot keep skb_shinfo related pointers past calling here!
2220 */
2221static int skb_prepare_for_shift(struct sk_buff *skb)
2222{
2223        return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2224}
2225
2226/**
2227 * skb_shift - Shifts paged data partially from skb to another
2228 * @tgt: buffer into which tail data gets added
2229 * @skb: buffer from which the paged data comes from
2230 * @shiftlen: shift up to this many bytes
2231 *
2232 * Attempts to shift up to shiftlen worth of bytes, which may be less than
2233 * the length of the skb, from skb to tgt. Returns number bytes shifted.
2234 * It's up to caller to free skb if everything was shifted.
 *
 * The return value is either @shiftlen (all requested bytes were moved)
 * or 0 (nothing was committed to either buffer's length accounting):
 * 0 is returned when one of the buffers is cloned and cannot be
 * unshared, or when @tgt does not have enough free frag slots.
2235 *
2236 * If @tgt runs out of frags, the whole operation is aborted.
2237 *
2238 * Skb cannot include anything else but paged data while tgt is allowed
2239 * to have non-paged data as well.
2240 *
2241 * TODO: full sized shift could be optimized but that would need
2242 * specialized skb free'er to handle frags without up-to-date nr_frags.
2243 */
2244int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
2245{
2246        int from, to, merge, todo;
2247        struct skb_frag_struct *fragfrom, *fragto;
2248
2249        BUG_ON(shiftlen > skb->len);
2250        BUG_ON(skb_headlen(skb));       /* Would corrupt stream */
2251
        /*
         * todo: bytes still to be moved.
         * from: index of the next source frag in skb.
         * to:   index of the next free frag slot in tgt.
         * merge: index of the tgt frag that absorbs skb's first frag,
         *        or -1 when the two cannot be coalesced.
         */
2252        todo = shiftlen;
2253        from = 0;
2254        to = skb_shinfo(tgt)->nr_frags;
2255        fragfrom = &skb_shinfo(skb)->frags[from];
2256
2257        /* Actual merge is delayed until the point when we know we can
2258         * commit all, so that we don't have to undo partial changes
2259         */
2260        if (!to ||
2261            !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
2262                              fragfrom->page_offset)) {
2263                merge = -1;
2264        } else {
2265                merge = to - 1;
2266
2267                todo -= skb_frag_size(fragfrom);
2268                if (todo < 0) {
                        /* Less than the whole first frag is wanted:
                         * grow tgt's last frag by shiftlen and trim the
                         * same amount off the front of skb's first
                         * frag. No frag slots change hands.
                         */
2269                        if (skb_prepare_for_shift(skb) ||
2270                            skb_prepare_for_shift(tgt))
2271                                return 0;
2272
2273                        /* All previous frag pointers might be stale! */
2274                        fragfrom = &skb_shinfo(skb)->frags[from];
2275                        fragto = &skb_shinfo(tgt)->frags[merge];
2276
2277                        skb_frag_size_add(fragto, shiftlen);
2278                        skb_frag_size_sub(fragfrom, shiftlen);
2279                        fragfrom->page_offset += shiftlen;
2280
2281                        goto onlymerged;
2282                }
2283
                /* First frag will be merged whole; continue with the next. */
2284                from++;
2285        }
2286
2287        /* Skip full, not-fitting skb to avoid expensive operations */
2288        if ((shiftlen == skb->len) &&
2289            (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
2290                return 0;
2291
2292        if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
2293                return 0;
2294
        /* Fill tgt's free slots: frags that fit entirely in todo are
         * copied over as descriptors; a final frag that is only partly
         * wanted is shared (extra page reference) and split between
         * the two buffers. tgt's nr_frags is not touched yet, so the
         * early return below leaves tgt's visible frag count unchanged.
         */
2295        while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
2296                if (to == MAX_SKB_FRAGS)
2297                        return 0;
2298
2299                fragfrom = &skb_shinfo(skb)->frags[from];
2300                fragto = &skb_shinfo(tgt)->frags[to];
2301
2302                if (todo >= skb_frag_size(fragfrom)) {
2303                        *fragto = *fragfrom;
2304                        todo -= skb_frag_size(fragfrom);
2305                        from++;
2306                        to++;
2307
2308                } else {
2309                        __skb_frag_ref(fragfrom);
2310                        fragto->page = fragfrom->page;
2311                        fragto->page_offset = fragfrom->page_offset;
2312                        skb_frag_size_set(fragto, todo);
2313
2314                        fragfrom->page_offset += todo;
2315                        skb_frag_size_sub(fragfrom, todo);
2316                        todo = 0;
2317
2318                        to++;
2319                        break;
2320                }
2321        }
2322
2323        /* Ready to "commit" this state change to tgt */
2324        skb_shinfo(tgt)->nr_frags = to;
2325
2326        if (merge >= 0) {
                /* skb's first frag is absorbed whole into tgt's
                 * frag[merge]: extend that frag and drop the page
                 * reference skb's frag descriptor was holding.
                 */
2327                fragfrom = &skb_shinfo(skb)->frags[0];
2328                fragto = &skb_shinfo(tgt)->frags[merge];
2329
2330                skb_frag_size_add(fragto, skb_frag_size(fragfrom));
2331                __skb_frag_unref(fragfrom);
2332        }
2333
2334        /* Reposition in the original skb */
2335        to = 0;
2336        while (from < skb_shinfo(skb)->nr_frags)
2337                skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
2338        skb_shinfo(skb)->nr_frags = to;
2339
2340        BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
2341
2342onlymerged:
2343        /* Most likely the tgt won't ever need its checksum anymore, skb on
2344         * the other hand might need it if it needs to be resent
2345         */
2346        tgt->ip_summed = CHECKSUM_PARTIAL;
2347        skb->ip_summed = CHECKSUM_PARTIAL;
2348
2349        /* Yak, is it really working this way? Some helper please? */
2350        skb->len -= shiftlen;
2351        skb->data_len -= shiftlen;
2352        skb->truesize -= shiftlen;
2353        tgt->len += shiftlen;
2354        tgt->data_len += shiftlen;
2355        tgt->truesize += shiftlen;
2356
2357        return shiftlen;
2358}
2359
2360/**
2361 * skb_prepare_seq_read - Prepare a sequential read of skb data
2362 * @skb: the buffer to read
2363 * @from: lower offset of data to be read
2364 * @to: upper offset of data to be read
2365 * @st: state variable
2366 *
2367 * Initializes the specified state variable. Must be called before
2368 * invoking skb_seq_read() for the first time.
2369 */
2370void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
2371                          unsigned int to, struct skb_seq_state *st)
2372{
2373        st->lower_offset = from;
2374        st->upper_offset = to;
2375        st->root_skb = st->cur_skb = skb;
2376        st->frag_idx = st->stepped_offset = 0;
2377        st->frag_data = NULL;
2378}
2379EXPORT_SYMBOL(skb_prepare_seq_read);
2380
2381/**
2382 * skb_seq_read - Sequentially read skb data
2383 * @consumed: number of bytes consumed by the caller so far
2384 * @data: destination pointer for data to be returned
2385 * @st: state variable
2386 *
2387 * Reads a block of skb data at &consumed relative to the
2388 * lower offset specified to skb_prepare_seq_read(). Assigns
2389 * the head of the data block to &data and returns the length
2390 * of the block or 0 if the end of the skb data or the upper
2391 * offset has been reached.
2392 *
2393 * The caller is not required to consume all of the data
2394 * returned, i.e. &consumed is typically set to the number
2395 * of bytes already consumed and the next call to
2396 * skb_seq_read() will return the remaining part of the block.
2397 *
2398 * Note 1: The size of each block of data returned can be arbitrary,
2399 *       this limitation is the cost for zerocopy seqeuental
2400 *       reads of potentially non linear data.
2401 *
2402 * Note 2: Fragment lists within fragments are not implemented
2403 *       at the moment, state->root_skb could be replaced with
2404 *       a stack for this purpose.
2405 */
2406unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
2407                          struct skb_seq_state *st)
2408{
2409        unsigned int block_limit, abs_offset = consumed + st->lower_offset;
2410        skb_frag_t *frag;
2411
2412        if (unlikely(abs_offset >= st->upper_offset))
2413                return 0;
2414
2415next_skb:
2416        block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
2417
2418        if (abs_offset < block_limit && !st->frag_data) {
2419                *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
2420                return block_limit - abs_offset;
2421        }
2422
2423        if (st->frag_idx == 0 && !st->frag_data)
2424                st->stepped_offset += skb_headlen(st->cur_skb);
2425
2426        while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
2427                frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
2428                block_limit = skb_frag_size(frag) + st->stepped_offset;
2429
2430                if (abs_offset < block_limit) {
2431                        if (!st->frag_data)
2432                                st->frag_data = kmap_skb_frag(frag);
2433
2434                        *data = (u8 *) st->frag_data + frag->page_offset +
2435                                (abs_offset - st->stepped_offset);
2436
2437                        return block_limit - abs_offset;
2438                }
2439
2440                if (st->frag_data) {
2441                        kunmap_skb_frag(st->frag_data);
2442                        st->frag_data = NULL;
2443                }
2444
2445                st->frag_idx++;
2446                st->stepped_offset += skb_frag_size(frag);
2447        }
2448
2449        if (st->frag_data) {
2450                kunmap_skb_frag(st->frag_data);
2451                st->frag_data = NULL;
2452        }
2453
2454        if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
2455                st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
2456                st->frag_idx = 0;
2457                goto next_skb;
2458        } else if (st->cur_skb->next) {
2459                st->cur_skb = st->cur_skb->next;
2460                st->frag_idx = 0;
2461                goto next_skb;
2462        }
2463
2464        return 0;
2465}
2466EXPORT_SYMBOL(skb_seq_read);
2467
2468/**
2469 * skb_abort_seq_read - Abort a sequential read of skb data
2470 * @st: state variable
2471 *
2472 * Must be called if skb_seq_read() was not called until it
2473 * returned 0.
2474 */
2475void skb_abort_seq_read(struct skb_seq_state *st)
2476{
2477        if (st->frag_data)
2478                kunmap_skb_frag(st->frag_data);
2479}
2480EXPORT_SYMBOL(skb_abort_seq_read);
2481
2482#define TS_SKB_CB(state)        ((struct skb_seq_state *) &((state)->cb))
2483
2484static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
2485                                          struct ts_config *conf,
2486                                          struct ts_state *state)
2487{
2488        return skb_seq_read(offset, text, TS_SKB_CB(state));
2489}
2490
/*
 * finish hook installed by skb_find_text(): aborts the sequential read
 * whose state is kept in @state's control buffer. @conf is unused.
 */
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
        struct skb_seq_state *st = TS_SKB_CB(state);

        skb_abort_seq_read(st);
}
2495
2496/**
2497 * skb_find_text - Find a text pattern in skb data
2498 * @skb: the buffer to look in
2499 * @from: search offset
2500 * @to: search limit
2501 * @config: textsearch configuration
2502 * @state: uninitialized textsearch state variable
2503 *
2504 * Finds a pattern in the skb data according to the specified
2505 * textsearch configuration. Use textsearch_next() to retrieve
2506 * subsequent occurrences of the pattern. Returns the offset
2507 * to the first occurrence or UINT_MAX if no match was found.
2508 */
2509unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
2510                           unsigned int to, struct ts_config *config,
2511                           struct ts_state *state)
2512{
2513        unsigned int ret;
2514
2515        config->get_next_block = skb_ts_get_next_block;
2516        config->finish = skb_ts_finish;
2517
2518        skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
2519
2520        ret = textsearch_find(config, state);
2521        return (ret <= to - from ? ret : UINT_MAX);
2522}
2523EXPORT_SYMBOL(skb_find_text);
2524
2525/**
2526 * skb_append_datato_frags: - append the user data to a skb
2527 * @sk: sock  structure
2528 * @skb: skb structure to be appened with user data.
2529 * @getfrag: call back function to be used for getting the user data
2530 * @from: pointer to user message iov
2531 * @length: length of the iov message
2532 *
2533 * Description: This procedure append the user data in the fragment part
2534 * of the skb if any page alloc fails user this procedure returns  -ENOMEM
2535 */
2536int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
2537                        int (*getfrag)(void *from, char *to, int offset,
2538                                        int len, int odd, struct sk_buff *skb),
2539                        void *from, int length)
2540{
2541        int frg_cnt = 0;
2542        skb_frag_t *frag = NULL;
2543        struct page *page = NULL;
2544        int copy, left;
2545        int offset = 0;
2546        int ret;
2547
2548        do {
2549                /* Return error if we don't have space for new frag */
2550                frg_cnt = skb_shinfo(skb)->nr_frags;
2551                if (frg_cnt >= MAX_SKB_FRAGS)
2552                        return -EFAULT;
2553
2554                /* allocate a new page for next frag */
2555                page = alloc_pages(sk->sk_allocation, 0);
2556
2557                /* If alloc_page fails just return failure and caller will
2558                 * free previous allocated pages by doing kfree_skb()
2559                 */
2560                if (page == NULL)
2561                        return -ENOMEM;
2562
2563                /* initialize the next frag */
2564                skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
2565                skb->truesize += PAGE_SIZE;
2566                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
2567
2568                /* get the new initialized frag */
2569                frg_cnt = skb_shinfo(skb)->nr_frags;
2570                frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
2571
2572                /* copy the user data to page */
2573                left = PAGE_SIZE - frag->page_offset;
2574                copy = (length > left)? left : length;
2575
2576                ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag),
2577                            offset, copy, 0, skb);
2578                if (ret < 0)
2579                        return -EFAULT;
2580
2581                /* copy was successful so update the size parameters */
2582                skb_frag_size_add(frag, copy);
2583                skb->len += copy;
2584                skb->data_len += copy;
2585                offset += copy;
2586                length -= copy;
2587
2588        } while (length > 0);
2589
2590        return 0;
2591}
2592EXPORT_SYMBOL(skb_append_datato_frags);
2593
2594/**
2595 *      skb_pull_rcsum - pull skb and update receive checksum
2596 *      @skb: buffer to update
2597 *      @len: length of data pulled
2598 *
2599 *      This function performs an skb_pull on the packet and updates
2600 *      the CHECKSUM_COMPLETE checksum.  It should be used on
2601 *      receive path processing instead of skb_pull unless you know
2602 *      that the checksum difference is zero (e.g., a valid IP header)
2603 *      or you are setting ip_summed to CHECKSUM_NONE.
2604 */
2605unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
2606{
2607        BUG_ON(len > skb->len);
2608        skb->len -= len;
2609        BUG_ON(skb->len < skb->data_len);
2610        skb_postpull_rcsum(skb, skb->data, len);
2611        return skb->data += len;
2612}
2613EXPORT_SYMBOL_GPL(skb_pull_rcsum);
2614
2615/**
2616 *      skb_segment - Perform protocol segmentation on skb.
2617 *      @skb: buffer to segment
2618 *      @features: features for the output path (see dev->features)
2619 *
2620 *      This function performs segmentation on the given skb.  It returns
2621 *      a pointer to the first in a list of new skbs for the segments.
2622 *      In case of error it returns ERR_PTR(err).
     *
     *      Each segment carries at most gso_size bytes of payload and gets
     *      its own copy of the bytes between the MAC header and skb->data.
     *      All allocations use GFP_ATOMIC.  On failure every segment built
     *      so far is freed before the error pointer is returned.
2623 */
2624struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
2625{
2626        struct sk_buff *segs = NULL;
2627        struct sk_buff *tail = NULL;
2628        struct sk_buff *fskb = skb_shinfo(skb)->frag_list;
2629        unsigned int mss = skb_shinfo(skb)->gso_size;
2630        unsigned int doffset = skb->data - skb_mac_header(skb);
2631        unsigned int offset = doffset;
2632        unsigned int headroom;
2633        unsigned int len;
2634        int sg = !!(features & NETIF_F_SG);
2635        int nfrags = skb_shinfo(skb)->nr_frags;
2636        int err = -ENOMEM;
2637        int i = 0;
2638        int pos;
2639
            /* Push back the doffset header bytes so that skb->data starts at
             * the MAC header; offset and pos below are measured from there.
             */
2640        __skb_push(skb, doffset);
2641        headroom = skb_headroom(skb);
2642        pos = skb_headlen(skb);
2643
2644        do {
2645                struct sk_buff *nskb;
2646                skb_frag_t *frag;
2647                int hsize;
2648                int size;
2649
                    /* Payload of this segment: what is left, capped at mss. */
2650                len = skb->len - offset;
2651                if (len > mss)
2652                        len = mss;
2653
                    /* hsize: payload bytes that go into nskb's linear area.
                     * Without scatter-gather the whole segment is linear.
                     */
2654                hsize = skb_headlen(skb) - offset;
2655                if (hsize < 0)
2656                        hsize = 0;
2657                if (hsize > len || !sg)
2658                        hsize = len;
2659
                    /* Nothing left in the linear area or the page frags: this
                     * segment is the next frag_list skb, taken whole as a clone.
                     */
2660                if (!hsize && i >= nfrags) {
2661                        BUG_ON(fskb->len != len);
2662
2663                        pos += len;
2664                        nskb = skb_clone(fskb, GFP_ATOMIC);
2665                        fskb = fskb->next;
2666
2667                        if (unlikely(!nskb))
2668                                goto err;
2669
                            /* hsize is reused here to remember the head buffer
                             * size, so that truesize can be corrected by the
                             * difference if skb_cow_head() changes it.
                             */
2670                        hsize = skb_end_pointer(nskb) - nskb->head;
2671                        if (skb_cow_head(nskb, doffset + headroom)) {
2672                                kfree_skb(nskb);
2673                                goto err;
2674                        }
2675
2676                        nskb->truesize += skb_end_pointer(nskb) - nskb->head -
2677                                          hsize;
2678                        skb_release_head_state(nskb);
2679                        __skb_push(nskb, doffset);
2680                } else {
2681                        nskb = alloc_skb(hsize + doffset + headroom,
2682                                         GFP_ATOMIC);
2683
2684                        if (unlikely(!nskb))
2685                                goto err;
2686
2687                        skb_reserve(nskb, headroom);
2688                        __skb_put(nskb, doffset);
2689                }
2690
                    /* Append nskb to the result list; from here on the err path
                     * frees it together with the earlier segments.
                     */
2691                if (segs)
2692                        tail->next = nskb;
2693                else
2694                        segs = nskb;
2695                tail = nskb;
2696
2697                __copy_skb_header(nskb, skb);
2698                nskb->mac_len = skb->mac_len;
2699
2700                /* nskb and skb might have different headroom */
2701                if (nskb->ip_summed == CHECKSUM_PARTIAL)
2702                        nskb->csum_start += skb_headroom(nskb) - headroom;
2703
2704                skb_reset_mac_header(nskb);
2705                skb_set_network_header(nskb, skb->mac_len);
2706                nskb->transport_header = (nskb->network_header +
2707                                          skb_network_header_len(skb));
                    /* Every segment gets its own copy of the doffset header bytes. */
2708                skb_copy_from_linear_data(skb, nskb->data, doffset);
2709
                    /* Once fskb has moved past the first frag_list entry the
                     * payload is presumably already in nskb (clone path above),
                     * so no copying is needed.  The continue jumps to the loop
                     * condition, which still advances offset by len.
                     */
2710                if (fskb != skb_shinfo(skb)->frag_list)
2711                        continue;
2712
                    /* No scatter-gather: copy the whole segment into the linear
                     * area and compute its checksum while copying.
                     */
2713                if (!sg) {
2714                        nskb->ip_summed = CHECKSUM_NONE;
2715                        nskb->csum = skb_copy_and_csum_bits(skb, offset,
2716                                                            skb_put(nskb, len),
2717                                                            len, 0);
2718                        continue;
2719                }
2720
2721                frag = skb_shinfo(nskb)->frags;
2722
2723                skb_copy_from_linear_data_offset(skb, offset,
2724                                                 skb_put(nskb, hsize), hsize);
2725
                    /* Reference the page frags overlapping [offset, offset + len);
                     * pos is the offset at which frag i starts.
                     */
2726                while (pos < offset + len && i < nfrags) {
2727                        *frag = skb_shinfo(skb)->frags[i];
2728                        __skb_frag_ref(frag);
2729                        size = skb_frag_size(frag);
2730
                            /* Frag starts before this segment: skip the bytes
                             * already consumed by the previous segment.
                             */
2731                        if (pos < offset) {
2732                                frag->page_offset += offset - pos;
2733                                skb_frag_size_sub(frag, offset - pos);
2734                        }
2735
2736                        skb_shinfo(nskb)->nr_frags++;
2737
2738                        if (pos + size <= offset + len) {
2739                                i++;
2740                                pos += size;
2741                        } else {
                                    /* Frag extends past the segment end: trim
                                     * the tail and keep i/pos so the next
                                     * segment continues in the same frag.
                                     */
2742                                skb_frag_size_sub(frag, pos + size - (offset + len));
2743                                goto skip_fraglist;
2744                        }
2745
2746                        frag++;
2747                }
2748
                    /* Page frags ran out before the segment was full: the rest
                     * must be exactly the next frag_list skb, which is attached
                     * as nskb's frag_list.
                     */
2749                if (pos < offset + len) {
2750                        struct sk_buff *fskb2 = fskb;
2751
2752                        BUG_ON(pos + fskb->len != offset + len);
2753
2754                        pos += fskb->len;
2755                        fskb = fskb->next;
2756
                            /* A clone is used when more skbs follow (presumably
                             * so the rest of the chain is not dragged along);
                             * the last one is simply given an extra reference.
                             */
2757                        if (fskb2->next) {
2758                                fskb2 = skb_clone(fskb2, GFP_ATOMIC);
2759                                if (!fskb2)
2760                                        goto err;
2761                        } else
2762                                skb_get(fskb2);
2763
2764                        SKB_FRAG_ASSERT(nskb);
2765                        skb_shinfo(nskb)->frag_list = fskb2;
2766                }
2767
2768skip_fraglist:
                    /* Account for the non-linear part of the segment. */
2769                nskb->data_len = len - hsize;
2770                nskb->len += nskb->data_len;
2771                nskb->truesize += nskb->data_len;
2772        } while ((offset += len) < skb->len);
2773
2774        return segs;
2775
2776err:
            /* Free every segment built so far; skb is reused as the cursor. */
2777        while ((skb = segs)) {
2778                segs = skb->next;
2779                kfree_skb(skb);
2780        }
2781        return ERR_PTR(err);
2782}
2783EXPORT_SYMBOL_GPL(skb_segment);
2784
/**
 *      skb_gro_receive - merge a received skb into a held GRO packet
 *      @head: points at the GRO packet to merge into; rewritten when a new
 *             head skb has to be allocated in front of it
 *      @skb: the skb whose GRO payload is to be appended
 *
 *      Returns 0 on success, -E2BIG when the merge is refused (merged
 *      length would reach 65536, too many page frags, or the GRO length
 *      of the held packet differs from its gso_size) and -ENOMEM when
 *      a new head skb cannot be allocated.
 */
2785int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2786{
2787        struct sk_buff *p = *head;
2788        struct sk_buff *nskb;
2789        struct skb_shared_info *skbinfo = skb_shinfo(skb);
2790        struct skb_shared_info *pinfo = skb_shinfo(p);
2791        unsigned int headroom;
2792        unsigned int len = skb_gro_len(skb);
2793        unsigned int offset = skb_gro_offset(skb);
2794        unsigned int headlen = skb_headlen(skb);
2795
            /* Keep the merged packet shorter than 65536 bytes. */
2796        if (p->len + len >= 65536)
2797                return -E2BIG;
2798
            /* p already chains skbs on its frag_list: just append to it. */
2799        if (pinfo->frag_list)
2800                goto merge;
2801        else if (headlen <= offset) {
                    /* The GRO payload of skb lies entirely in its page frags:
                     * move the frag descriptors to the end of p's frag array.
                     */
2802                skb_frag_t *frag;
2803                skb_frag_t *frag2;
2804                int i = skbinfo->nr_frags;
2805                int nr_frags = pinfo->nr_frags + i;
2806
2807                offset -= headlen;
2808
2809                if (nr_frags > MAX_SKB_FRAGS)
2810                        return -E2BIG;
2811
2812                pinfo->nr_frags = nr_frags;
2813                skbinfo->nr_frags = 0;
2814
                    /* Copy last to first, so that frag ends up on the first
                     * descriptor that was moved.
                     * NOTE(review): the do/while assumes i >= 1, i.e. that skb
                     * has at least one page frag on this path - confirm.
                     */
2815                frag = pinfo->frags + nr_frags;
2816                frag2 = skbinfo->frags + i;
2817                do {
2818                        *--frag = *--frag2;
2819                } while (--i);
2820
                    /* Drop the bytes of the first moved frag that precede the
                     * GRO offset.
                     */
2821                frag->page_offset += offset;
2822                skb_frag_size_sub(frag, offset);
2823
                    /* skb no longer owns any paged data. */
2824                skb->truesize -= skb->data_len;
2825                skb->len -= skb->data_len;
2826                skb->data_len = 0;
2827
                    /* Presumably tells the caller that skb itself can be
                     * released now - verify against the GRO core.
                     */
2828                NAPI_GRO_CB(skb)->free = 1;
2829                goto done;
2830        } else if (skb_gro_len(p) != pinfo->gso_size)
2831                return -E2BIG;
2832
            /* p is still a single skb: allocate a new head that holds only
             * p's headers (everything up to its GRO offset) and hang p off
             * the new head's frag_list.
             */
2833        headroom = skb_headroom(p);
2834        nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
2835        if (unlikely(!nskb))
2836                return -ENOMEM;
2837
2838        __copy_skb_header(nskb, p);
2839        nskb->mac_len = p->mac_len;
2840
2841        skb_reserve(nskb, headroom);
2842        __skb_put(nskb, skb_gro_offset(p));
2843
2844        skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
2845        skb_set_network_header(nskb, skb_network_offset(p));
2846        skb_set_transport_header(nskb, skb_transport_offset(p));
2847
            /* Strip the headers from p, then copy everything from p's MAC
             * header up to its new data pointer into nskb.
             */
2848        __skb_pull(p, skb_gro_offset(p));
2849        memcpy(skb_mac_header(nskb), skb_mac_header(p),
2850               p->data - skb_mac_header(p));
2851
2852        *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
2853        skb_shinfo(nskb)->frag_list = p;
2854        skb_shinfo(nskb)->gso_size = pinfo->gso_size;
2855        pinfo->gso_size = 0;
2856        skb_header_release(p);
            /* prev of the head tracks the last skb on the frag_list chain;
             * the merge path below appends through it.
             */
2857        nskb->prev = p;
2858
2859        nskb->data_len += p->len;
2860        nskb->truesize += p->len;
2861        nskb->len += p->len;
2862
            /* nskb takes p's place in the list that *head belongs to. */
2863        *head = nskb;
2864        nskb->next = p->next;
2865        p->next = NULL;
2866
2867        p = nskb;
2868
2869merge:
            /* If the GRO offset reaches into the first page frag, trim those
             * bytes from the frag so that only the linear part needs pulling.
             */
2870        if (offset > headlen) {
2871                unsigned int eat = offset - headlen;
2872
2873                skbinfo->frags[0].page_offset += eat;
2874                skb_frag_size_sub(&skbinfo->frags[0], eat);
2875                skb->data_len -= eat;
2876                skb->len -= eat;
2877                offset = headlen;
2878        }
2879
2880        __skb_pull(skb, offset);
2881
            /* Append skb after the current last skb of the chain. */
2882        p->prev->next = skb;
2883        p->prev = skb;
2884        skb_header_release(skb);
2885
2886done:
            /* Account the len bytes of new payload on the head packet. */
2887        NAPI_GRO_CB(p)->count++;
2888        p->data_len += len;
2889        p->truesize += len;
2890        p->len += len;
2891
2892        NAPI_GRO_CB(skb)->same_flow = 1;
2893        return 0;
2894}
2895EXPORT_SYMBOL_GPL(skb_gro_receive);
2896
2897void __init skb_init(void)
2898{
2899        skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
2900                                              sizeof(struct sk_buff),
2901                                              0,
2902                                              SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2903                                              NULL);
2904        skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
2905                                                (2*sizeof(struct sk_buff)) +
2906                                                sizeof(atomic_t),
2907                                                0,
2908                                                SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2909                                                NULL);
2910}
2911
2912/**
2913 *      skb_to_sgvec - Fill a scatter-gather list from a socket buffer
2914 *      @skb: Socket buffer containing the buffers to be mapped
2915 *      @sg: The scatter-gather list to map into
2916 *      @offset: The offset into the buffer's contents to start mapping
2917 *      @len: Length of buffer space to be mapped
2918 *
2919 *      Fill the specified scatter-gather list with mappings/pointers into a
2920 *      region of the buffer space attached to a socket buffer.
2921 */
2922static int
2923__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
2924{
2925        int start = skb_headlen(skb);
2926        int i, copy = start - offset;
2927        struct sk_buff *frag_iter;
2928        int elt = 0;
2929
2930        if (copy > 0) {
2931                if (copy > len)
2932                        copy = len;
2933                sg_set_buf(sg, skb->data + offset, copy);
2934                elt++;
2935                if ((len -= copy) == 0)
2936                        return elt;
2937                offset += copy;
2938        }
2939
2940        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2941                int end;
2942
2943                WARN_ON(start > offset + len);
2944
2945                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
2946                if ((copy = end - offset) > 0) {
2947                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2948
2949                        if (copy > len)
2950                                copy = len;
2951                        sg_set_page(&sg[elt], skb_frag_page(frag), copy,
2952                                        frag->page_offset+offset-start);
2953                        elt++;
2954                        if (!(len -= copy))
2955                                return elt;
2956                        offset += copy;
2957                }
2958                start = end;
2959        }
2960
2961        skb_walk_frags(skb, frag_iter) {
2962                int end;
2963
2964                WARN_ON(start > offset + len);
2965
2966                end = start + frag_iter->len;
2967                if ((copy = end - offset) > 0) {
2968                        if (copy > len)
2969                                copy = len;
2970                        elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
2971                                              copy);
2972                        if ((len -= copy) == 0)
2973                                return elt;
2974                        offset += copy;
2975                }
2976                start = end;
2977        }
2978        BUG_ON(len);
2979        return elt;
2980}
2981
2982int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
2983{
2984        int nsg = __skb_to_sgvec(skb, sg, offset, len);
2985
2986        sg_mark_end(&sg[nsg - 1]);
2987
2988        return nsg;
2989}
2990EXPORT_SYMBOL_GPL(skb_to_sgvec);
2991
2992/**
2993 *      skb_cow_data - Check that a socket buffer's data buffers are writable
2994 *      @skb: The socket buffer to check.
2995 *      @tailbits: Amount of trailing space to be added
2996 *      @trailer: Returned pointer to the skb where the @tailbits space begins
2997 *
2998 *      Make sure that the data buffers attached to a socket buffer are
2999 *      writable. If they are not, private copies are made of the data buffers
3000 *      and the socket buffer is set to use these instead.
3001 *
3002 *      If @tailbits is given, make sure that there is space to write @tailbits
3003 *      bytes of data beyond current end of socket buffer.  @trailer will be
3004 *      set to point to the skb in which this space begins.
3005 *
3006 *      The number of scatterlist elements required to completely map the
3007 *      COW'd and extended socket buffer will be returned.
     *
     *      Returns -ENOMEM if any reallocation or copy fails.  All allocations
     *      use GFP_ATOMIC.
3008 */
3009int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
3010{
3011        int copyflag;
3012        int elt;
3013        struct sk_buff *skb1, **skb_p;
3014
3015        /* If skb is cloned or its head is paged, reallocate
3016         * head pulling out all the pages (pages are considered not writable
3017         * at the moment even if they are anonymous).
3018         */
3019        if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
3020            __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
3021                return -ENOMEM;
3022
3023        /* Easy case. Most of packets will go this way. */
3024        if (!skb_has_frag_list(skb)) {
3025                /* A little of trouble, not enough of space for trailer.
3026                 * This should not happen, when stack is tuned to generate
3027                 * good frames. OK, on miss we reallocate and reserve even more
3028                 * space, 128 bytes is fair. */
3029
3030                if (skb_tailroom(skb) < tailbits &&
3031                    pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
3032                        return -ENOMEM;
3033
3034                /* Voila! */
3035                *trailer = skb;
3036                return 1;
3037        }
3038
3039        /* Misery. We are in troubles, going to mincer fragments... */
3040
            /* elt starts at 1 and grows by one per frag_list member visited.
             * skb_p always points at the link that leads to the current
             * member, so a copy can be spliced in place of the original.
             */
3041        elt = 1;
3042        skb_p = &skb_shinfo(skb)->frag_list;
3043        copyflag = 0;
3044
3045        while ((skb1 = *skb_p) != NULL) {
3046                int ntail = 0;
3047
3048                /* The fragment is partially pulled by someone,
3049                 * this can happen on input. Copy it and everything
3050                 * after it. */
3051
                    /* copyflag is never cleared again, so every later member
                     * of the list is copied as well.
                     */
3052                if (skb_shared(skb1))
3053                        copyflag = 1;
3054
3055                /* If the skb is the last, worry about trailer. */
3056
                    /* ntail != 0 forces a copy with tailbits + 128 bytes of
                     * tailroom when the last member cannot take @tailbits in
                     * its linear area as it stands.
                     */
3057                if (skb1->next == NULL && tailbits) {
3058                        if (skb_shinfo(skb1)->nr_frags ||
3059                            skb_has_frag_list(skb1) ||
3060                            skb_tailroom(skb1) < tailbits)
3061                                ntail = tailbits + 128;
3062                }
3063
3064                if (copyflag ||
3065                    skb_cloned(skb1) ||
3066                    ntail ||
3067                    skb_shinfo(skb1)->nr_frags ||
3068                    skb_has_frag_list(skb1)) {
3069                        struct sk_buff *skb2;
3070
3071                        /* This member has to be replaced by a private copy. */
3072                        if (ntail == 0)
3073                                skb2 = skb_copy(skb1, GFP_ATOMIC);
3074                        else
3075                                skb2 = skb_copy_expand(skb1,
3076                                                       skb_headroom(skb1),
3077                                                       ntail,
3078                                                       GFP_ATOMIC);
3079                        if (unlikely(skb2 == NULL))
3080                                return -ENOMEM;
3081
                            /* Carry the write-side socket ownership over to
                             * the copy.
                             */
3082                        if (skb1->sk)
3083                                skb_set_owner_w(skb2, skb1->sk);
3084
3085                        /* Looking around. Are we still alive?
3086                         * OK, link new skb, drop old one */
3087
3088                        skb2->next = skb1->next;
3089                        *skb_p = skb2;
3090                        kfree_skb(skb1);
3091                        skb1 = skb2;
3092                }
3093                elt++;
                    /* Overwritten on every pass, so after the loop it names
                     * the last member of the list.
                     */
3094                *trailer = skb1;
3095                skb_p = &skb1->next;
3096        }
3097
3098        return elt;
3099}
3100EXPORT_SYMBOL_GPL(skb_cow_data);
3101
3102static void sock_rmem_free(struct sk_buff *skb)
3103{
3104        struct sock *sk = skb->sk;
3105
3106        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
3107}
3108
3109/*
3110 * Note: We dont mem charge error packets (no sk_forward_alloc changes)
3111 */
3112int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
3113{
3114        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
3115            (unsigned)sk->sk_rcvbuf)
3116                return -ENOMEM;
3117
3118        skb_orphan(skb);
3119        skb->sk = sk;
3120        skb->destructor = sock_rmem_free;
3121        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
3122
3123        /* before exiting rcu section, make sure dst is refcounted */
3124        skb_dst_force(skb);
3125
3126        skb_queue_tail(&sk->sk_error_queue, skb);
3127        if (!sock_flag(sk, SOCK_DEAD))
3128                sk->sk_data_ready(sk, skb->len);
3129        return 0;
3130}
3131EXPORT_SYMBOL(sock_queue_err_skb);
3132
3133void skb_tstamp_tx(struct sk_buff *orig_skb,
3134                struct skb_shared_hwtstamps *hwtstamps)
3135{
3136        struct sock *sk = orig_skb->sk;
3137        struct sock_exterr_skb *serr;
3138        struct sk_buff *skb;
3139        int err;
3140
3141        if (!sk)
3142                return;
3143
3144        skb = skb_clone(orig_skb, GFP_ATOMIC);
3145        if (!skb)
3146                return;
3147
3148        if (hwtstamps) {
3149                *skb_hwtstamps(skb) =
3150                        *hwtstamps;
3151        } else {
3152                /*
3153                 * no hardware time stamps available,
3154                 * so keep the shared tx_flags and only
3155                 * store software time stamp
3156                 */
3157                skb->tstamp = ktime_get_real();
3158        }
3159
3160        serr = SKB_EXT_ERR(skb);
3161        memset(serr, 0, sizeof(*serr));
3162        serr->ee.ee_errno = ENOMSG;
3163        serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
3164
3165        err = sock_queue_err_skb(sk, skb);
3166
3167        if (err)
3168                kfree_skb(skb);
3169}
3170EXPORT_SYMBOL_GPL(skb_tstamp_tx);
3171
3172
3173/**
3174 * skb_partial_csum_set - set up and verify partial csum values for packet
3175 * @skb: the skb to set
3176 * @start: the number of bytes after skb->data to start checksumming.
3177 * @off: the offset from start to place the checksum.
3178 *
3179 * For untrusted partially-checksummed packets, we need to make sure the values
3180 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
3181 *
3182 * This function checks and sets those values and skb->ip_summed: if this
3183 * returns false you should drop the packet.
3184 */
3185bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
3186{
3187        if (unlikely(start > skb_headlen(skb)) ||
3188            unlikely((int)start + off > skb_headlen(skb) - 2)) {
3189                if (net_ratelimit())
3190                        printk(KERN_WARNING
3191                               "bad partial csum: csum=%u/%u len=%u\n",
3192                               start, off, skb_headlen(skb));
3193                return false;
3194        }
3195        skb->ip_summed = CHECKSUM_PARTIAL;
3196        skb->csum_start = skb_headroom(skb) + start;
3197        skb->csum_offset = off;
3198        return true;
3199}
3200EXPORT_SYMBOL_GPL(skb_partial_csum_set);
3201
3202void __skb_warn_lro_forwarding(const struct sk_buff *skb)
3203{
3204        if (net_ratelimit())
3205                pr_warning("%s: received packets cannot be forwarded"
3206                           " while LRO is enabled\n", skb->dev->name);
3207}
3208EXPORT_SYMBOL(__skb_warn_lro_forwarding);
3209
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.