linux-old/net/ipv4/ip_fragment.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              The IP fragmentation functionality.
   7 *              
   8 * Version:     $Id: ip_fragment.c,v 1.40 1999/03/20 23:58:34 davem Exp $
   9 *
  10 * Authors:     Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
  11 *              Alan Cox <Alan.Cox@linux.org>
  12 *
  13 * Fixes:
  14 *              Alan Cox        :       Split from ip.c , see ip_input.c for history.
  15 *              David S. Miller :       Begin massive cleanup...
  16 *              Andi Kleen      :       Add sysctls.
  17 *              xxxx            :       Overlapfrag bug.
  18 *              Ultima          :       ip_expire() kernel panic.
  19 *              Bill Hawes      :       Frag accounting and evictor fixes.
  20 *              John McDonald   :       0 length frag bug.
  21 */
  22
  23#include <linux/types.h>
  24#include <linux/mm.h>
  25#include <linux/sched.h>
  26#include <linux/skbuff.h>
  27#include <linux/ip.h>
  28#include <linux/icmp.h>
  29#include <linux/netdevice.h>
  30#include <net/sock.h>
  31#include <net/ip.h>
  32#include <net/icmp.h>
  33#include <linux/tcp.h>
  34#include <linux/udp.h>
  35#include <linux/inet.h>
  36#include <linux/firewall.h>
  37#include <linux/ip_fw.h>
  38
  39/* Fragment cache limits. We will commit 256K at one time. Should we
  40 * cross that limit we will prune down to 192K. This should cope with
  41 * even the most extreme cases without allowing an attacker to measurably
  42 * harm machine performance.
  43 */
  44int sysctl_ipfrag_high_thresh = 256*1024;
  45int sysctl_ipfrag_low_thresh = 192*1024;
  46
  47int sysctl_ipfrag_time = IP_FRAG_TIME;
  48
  49/* Describe an IP fragment. */
  50struct ipfrag {
  51        int             offset;         /* offset of fragment in IP datagram    */
  52        int             end;            /* last byte of data in datagram        */
  53        int             len;            /* length of this fragment              */
  54        struct sk_buff  *skb;           /* complete received fragment           */
  55        unsigned char   *ptr;           /* pointer into real fragment data      */
  56        struct ipfrag   *next;          /* linked list pointers                 */
  57        struct ipfrag   *prev;
  58};
  59
  60/* Describe an entry in the "incomplete datagrams" queue. */
  61struct ipq {
  62        struct iphdr    *iph;           /* pointer to IP header                 */
  63        struct ipq      *next;          /* linked list pointers                 */
  64        struct ipfrag   *fragments;     /* linked list of received fragments    */
  65        int             len;            /* total length of original datagram    */
  66        short           ihlen;          /* length of the IP header              */      
  67        struct timer_list timer;        /* when will this queue expire?         */
  68        struct ipq      **pprev;
  69        struct device   *dev;           /* Device - for icmp replies */
  70};
  71
  72#define IPQ_HASHSZ      64
  73
  74struct ipq *ipq_hash[IPQ_HASHSZ];
  75
  76#define ipqhashfn(id, saddr, daddr, prot) \
  77        ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1))
  78
  79atomic_t ip_frag_mem = ATOMIC_INIT(0);          /* Memory used for fragments */
  80
  81/* Memory Tracking Functions. */
  82extern __inline__ void frag_kfree_skb(struct sk_buff *skb)
  83{
  84        atomic_sub(skb->truesize, &ip_frag_mem);
  85        kfree_skb(skb);
  86}
  87
  88extern __inline__ void frag_kfree_s(void *ptr, int len)
  89{
  90        atomic_sub(len, &ip_frag_mem);
  91        kfree(ptr);
  92}
  93 
  94extern __inline__ void *frag_kmalloc(int size, int pri)
  95{
  96        void *vp = kmalloc(size, pri);
  97
  98        if(!vp)
  99                return NULL;
 100        atomic_add(size, &ip_frag_mem);
 101        return vp;
 102}
 103 
 104/* Create a new fragment entry. */
 105static struct ipfrag *ip_frag_create(int offset, int end,
 106                                     struct sk_buff *skb, unsigned char *ptr)
 107{
 108        struct ipfrag *fp;
 109
 110        fp = (struct ipfrag *) frag_kmalloc(sizeof(struct ipfrag), GFP_ATOMIC);
 111        if (fp == NULL)
 112                goto out_nomem;
 113
 114        /* Fill in the structure. */
 115        fp->offset = offset;
 116        fp->end = end;
 117        fp->len = end - offset;
 118        fp->skb = skb;
 119        fp->ptr = ptr;
 120        fp->next = fp->prev = NULL;
 121        
 122        /* Charge for the SKB as well. */
 123        atomic_add(skb->truesize, &ip_frag_mem);
 124
 125        return(fp);
 126
 127out_nomem:
 128        NETDEBUG(printk(KERN_ERR "IP: frag_create: no memory left !\n"));
 129        return(NULL);
 130}
 131
 132/* Find the correct entry in the "incomplete datagrams" queue for
 133 * this IP datagram, and return the queue entry address if found.
 134 */
 135static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst)
 136{
 137        __u16 id = iph->id;
 138        __u32 saddr = iph->saddr;
 139        __u32 daddr = iph->daddr;
 140        __u8 protocol = iph->protocol;
 141        unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
 142        struct ipq *qp;
 143
 144        /* Always, we are in a BH context, so no locking.  -DaveM */
 145        for(qp = ipq_hash[hash]; qp; qp = qp->next) {
 146                if(qp->iph->id == id            &&
 147                   qp->iph->saddr == saddr      &&
 148                   qp->iph->daddr == daddr      &&
 149                   qp->iph->protocol == protocol) {
 150                        del_timer(&qp->timer);
 151                        break;
 152                }
 153        }
 154        return qp;
 155}
 156
 157/* Remove an entry from the "incomplete datagrams" queue, either
 158 * because we completed, reassembled and processed it, or because
 159 * it timed out.
 160 *
 161 * This is called _only_ from BH contexts, on packet reception
 162 * processing and from frag queue expiration timers.  -DaveM
 163 */
 164static void ip_free(struct ipq *qp)
 165{
 166        struct ipfrag *fp;
 167
 168        /* Stop the timer for this entry. */
 169        del_timer(&qp->timer);
 170
 171        /* Remove this entry from the "incomplete datagrams" queue. */
 172        if(qp->next)
 173                qp->next->pprev = qp->pprev;
 174        *qp->pprev = qp->next;
 175
 176        /* Release all fragment data. */
 177        fp = qp->fragments;
 178        while (fp) {
 179                struct ipfrag *xp = fp->next;
 180
 181                frag_kfree_skb(fp->skb);
 182                frag_kfree_s(fp, sizeof(struct ipfrag));
 183                fp = xp;
 184        }
 185
 186        /* Release the IP header. */
 187        frag_kfree_s(qp->iph, 64 + 8);
 188
 189        /* Finally, release the queue descriptor itself. */
 190        frag_kfree_s(qp, sizeof(struct ipq));
 191}
 192
 193/*
 194 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 195 */
 196static void ip_expire(unsigned long arg)
 197{
 198        struct ipq *qp = (struct ipq *) arg;
 199
 200        if(!qp->fragments)
 201        {       
 202#ifdef IP_EXPIRE_DEBUG
 203                printk("warning: possible ip-expire attack\n");
 204#endif
 205                goto out;
 206        }
 207  
 208        /* Send an ICMP "Fragment Reassembly Timeout" message. */
 209        ip_statistics.IpReasmTimeout++;
 210        ip_statistics.IpReasmFails++;   
 211        icmp_send(qp->fragments->skb, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
 212
 213out:
 214        /* Nuke the fragment queue. */
 215        ip_free(qp);
 216}
 217
 218/* Memory limiting on fragments.  Evictor trashes the oldest 
 219 * fragment queue until we are back under the low threshold.
 220 */
 221static void ip_evictor(void)
 222{
 223        int i, progress;
 224
 225restart:
 226        progress = 0;
 227        /* FIXME: Make LRU queue of frag heads. -DaveM */
 228        for (i = 0; i < IPQ_HASHSZ; i++) {
 229                struct ipq *qp;
 230                if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh)
 231                        return;
 232                /* We are in a BH context, so these queue
 233                 * accesses are safe.  -DaveM
 234                 */
 235                qp = ipq_hash[i];
 236                if (qp) {
 237                        /* find the oldest queue for this hash bucket */
 238                        while (qp->next)
 239                                qp = qp->next;
 240                        ip_free(qp);
 241                        progress = 1;
 242                }
 243        }
 244        if (progress)
 245                goto restart;
 246        panic("ip_evictor: memcount");
 247}
 248
 249/* Add an entry to the 'ipq' queue for a newly received IP datagram.
 250 * We will (hopefully :-) receive all other fragments of this datagram
 251 * in time, so we just create a queue for this datagram, in which we
 252 * will insert the received fragments at their respective positions.
 253 */
 254static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph)
 255{
 256        struct ipq *qp;
 257        unsigned int hash;
 258        int ihlen;
 259
 260        qp = (struct ipq *) frag_kmalloc(sizeof(struct ipq), GFP_ATOMIC);
 261        if (qp == NULL)
 262                goto out_nomem;
 263
 264        /* Allocate memory for the IP header (plus 8 octets for ICMP). */
 265        ihlen = iph->ihl * 4;
 266
 267        qp->iph = (struct iphdr *) frag_kmalloc(64 + 8, GFP_ATOMIC);
 268        if (qp->iph == NULL)
 269                goto out_free;
 270
 271        memcpy(qp->iph, iph, ihlen + 8);
 272        qp->len = 0;
 273        qp->ihlen = ihlen;
 274        qp->fragments = NULL;
 275        qp->dev = skb->dev;
 276
 277        /* Initialize a timer for this entry. */
 278        init_timer(&qp->timer);
 279        qp->timer.expires = 0;                  /* (to be set later)    */
 280        qp->timer.data = (unsigned long) qp;    /* pointer to queue     */
 281        qp->timer.function = ip_expire;         /* expire function      */
 282
 283        /* Add this entry to the queue. */
 284        hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
 285
 286        /* We are in a BH context, no locking necessary.  -DaveM */
 287        if((qp->next = ipq_hash[hash]) != NULL)
 288                qp->next->pprev = &qp->next;
 289        ipq_hash[hash] = qp;
 290        qp->pprev = &ipq_hash[hash];
 291
 292        return qp;
 293
 294out_free:
 295        frag_kfree_s(qp, sizeof(struct ipq));
 296out_nomem:
 297        NETDEBUG(printk(KERN_ERR "IP: create: no memory left !\n"));
 298        return(NULL);
 299}
 300
 301/* See if a fragment queue is complete. */
 302static int ip_done(struct ipq *qp)
 303{
 304        struct ipfrag *fp;
 305        int offset;
 306
 307        /* Only possible if we received the final fragment. */
 308        if (qp->len == 0)
 309                return 0;
 310
 311        /* Check all fragment offsets to see if they connect. */
 312        fp = qp->fragments;
 313        offset = 0;
 314        while (fp) {
 315                if (fp->offset > offset)
 316                        return(0);      /* fragment(s) missing */
 317                offset = fp->end;
 318                fp = fp->next;
 319        }
 320
 321        /* All fragments are present. */
 322        return 1;
 323}
 324
 325/* Build a new IP datagram from all its fragments.
 326 *
 327 * FIXME: We copy here because we lack an effective way of handling lists
 328 * of bits on input. Until the new skb data handling is in I'm not going
 329 * to touch this with a bargepole. 
 330 */
 331static struct sk_buff *ip_glue(struct ipq *qp)
 332{
 333        struct sk_buff *skb;
 334        struct iphdr *iph;
 335        struct ipfrag *fp;
 336        unsigned char *ptr;
 337        int count, len;
 338
 339        /* Allocate a new buffer for the datagram. */
 340        len = qp->ihlen + qp->len;
 341        
 342        if(len > 65535)
 343                goto out_oversize;
 344        
 345        skb = dev_alloc_skb(len);
 346        if (!skb)
 347                goto out_nomem;
 348
 349        /* Fill in the basic details. */
 350        skb->mac.raw = ptr = skb->data;
 351        skb->nh.iph = iph = (struct iphdr *) skb_put(skb, len);
 352
 353        /* Copy the original IP headers into the new buffer. */
 354        memcpy(ptr, qp->iph, qp->ihlen);
 355        ptr += qp->ihlen;
 356
 357        /* Copy the data portions of all fragments into the new buffer. */
 358        fp = qp->fragments;
 359        count = qp->ihlen;
 360        while(fp) {
 361                if ((fp->len <= 0) || ((count + fp->len) > skb->len))
 362                        goto out_invalid;
 363                memcpy((ptr + fp->offset), fp->ptr, fp->len);
 364                if (count == qp->ihlen) {
 365                        skb->dst = dst_clone(fp->skb->dst);
 366                        skb->dev = fp->skb->dev;
 367                }
 368                count += fp->len;
 369                fp = fp->next;
 370        }
 371
 372        skb->pkt_type = qp->fragments->skb->pkt_type;
 373        skb->protocol = qp->fragments->skb->protocol;
 374        /*
 375        *  Clearly bogus, because security markings of the individual
 376        *  fragments should have been checked for consistency before
 377        *  gluing, and intermediate coalescing of fragments may have
 378        *  taken place in ip_defrag() before ip_glue() ever got called.
 379        *  If we're not going to do the consistency checking, we might
 380        *  as well take the value associated with the first fragment.
 381        *       --rct
 382        */
 383        skb->security = qp->fragments->skb->security;
 384
 385        /* Done with all fragments. Fixup the new IP header. */
 386        iph = skb->nh.iph;
 387        iph->frag_off = 0;
 388        iph->tot_len = htons(count);
 389        ip_statistics.IpReasmOKs++;
 390        return skb;
 391
 392out_invalid:
 393        NETDEBUG(printk(KERN_ERR
 394                        "Invalid fragment list: Fragment over size.\n"));
 395        kfree_skb(skb);
 396        goto out_fail;
 397out_nomem:
 398        NETDEBUG(printk(KERN_ERR 
 399                        "IP: queue_glue: no memory for gluing queue %p\n",
 400                        qp));
 401        goto out_fail;
 402out_oversize:
 403        if (net_ratelimit())
 404                printk(KERN_INFO
 405                        "Oversized IP packet from %d.%d.%d.%d.\n",
 406                        NIPQUAD(qp->iph->saddr));
 407out_fail:
 408        ip_statistics.IpReasmFails++;
 409        return NULL;
 410}
 411
 412/* Process an incoming IP datagram fragment. */
 413struct sk_buff *ip_defrag(struct sk_buff *skb)
 414{
 415        struct iphdr *iph = skb->nh.iph;
 416        struct ipfrag *prev, *next, *tmp, *tfp;
 417        struct ipq *qp;
 418        unsigned char *ptr;
 419        int flags, offset;
 420        int i, ihl, end;
 421        
 422        ip_statistics.IpReasmReqds++;
 423
 424        /* Start by cleaning up the memory. */
 425        if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
 426                ip_evictor();
 427
 428        /*
 429         * Look for the entry for this IP datagram in the
 430         * "incomplete datagrams" queue. If found, the
 431         * timer is removed.
 432         */
 433        qp = ip_find(iph, skb->dst);
 434
 435        /* Is this a non-fragmented datagram? */
 436        offset = ntohs(iph->frag_off);
 437        flags = offset & ~IP_OFFSET;
 438        offset &= IP_OFFSET;
 439
 440        offset <<= 3;           /* offset is in 8-byte chunks */
 441        ihl = iph->ihl * 4;
 442
 443        /*
 444         * Check whether to create a fresh queue entry. If the
 445         * queue already exists, its timer will be restarted as
 446         * long as we continue to receive fragments.
 447         */
 448        if (qp) {
 449                /* ANK. If the first fragment is received,
 450                 * we should remember the correct IP header (with options)
 451                 */
 452                if (offset == 0) {
 453                        /* Fragmented frame replaced by unfragmented copy? */
 454                        if ((flags & IP_MF) == 0)
 455                                goto out_freequeue;
 456                        qp->ihlen = ihl;
 457                        memcpy(qp->iph, iph, (ihl + 8));
 458                }
 459        } else {
 460                /* Fragmented frame replaced by unfragmented copy? */
 461                if ((offset == 0) && ((flags & IP_MF) == 0))
 462                        goto out_skb;
 463
 464                /* If we failed to create it, then discard the frame. */
 465                qp = ip_create(skb, iph);
 466                if (!qp)
 467                        goto out_freeskb;
 468        }
 469        
 470        /* Attempt to construct an oversize packet. */
 471        if((ntohs(iph->tot_len) + ((int) offset)) > 65535)
 472                goto out_oversize;
 473
 474        /* Determine the position of this fragment. */
 475        end = offset + ntohs(iph->tot_len) - ihl;
 476
 477        /* Is this the final fragment? */
 478        if ((flags & IP_MF) == 0)
 479                qp->len = end;
 480
 481        /* Find out which fragments are in front and at the back of us
 482         * in the chain of fragments so far.  We must know where to put
 483         * this fragment, right?
 484         */
 485        prev = NULL;
 486        for(next = qp->fragments; next != NULL; next = next->next) {
 487                if (next->offset >= offset)
 488                        break;  /* bingo! */
 489                prev = next;
 490        }
 491
 492        /* Point into the IP datagram 'data' part. */
 493        ptr = skb->data + ihl;
 494
 495        /* We found where to put this one.  Check for overlap with
 496         * preceding fragment, and, if needed, align things so that
 497         * any overlaps are eliminated.
 498         */
 499        if ((prev != NULL) && (offset < prev->end)) {
 500                i = prev->end - offset;
 501                offset += i;    /* ptr into datagram */
 502                ptr += i;       /* ptr into fragment data */
 503        }
 504
 505        /* Look for overlap with succeeding segments.
 506         * If we can merge fragments, do it.
 507         */
 508        for (tmp = next; tmp != NULL; tmp = tfp) {
 509                tfp = tmp->next;
 510                if (tmp->offset >= end)
 511                        break;          /* no overlaps at all   */
 512
 513                i = end - next->offset; /* overlap is 'i' bytes */
 514                tmp->len -= i;          /* so reduce size of    */
 515                tmp->offset += i;       /* next fragment        */
 516                tmp->ptr += i;
 517
 518                /* If we get a frag size of <= 0, remove it and the packet
 519                 * that it goes with.
 520                 */
 521                if (tmp->len <= 0) {
 522                        if (tmp->prev != NULL)
 523                                tmp->prev->next = tmp->next;
 524                        else
 525                                qp->fragments = tmp->next;
 526
 527                        if (tmp->next != NULL)
 528                                tmp->next->prev = tmp->prev;
 529                        
 530                        /* We have killed the original next frame. */
 531                        next = tfp;
 532
 533                        frag_kfree_skb(tmp->skb);
 534                        frag_kfree_s(tmp, sizeof(struct ipfrag));
 535                }
 536        }
 537
 538        /*
 539         * Create a fragment to hold this skb.
 540         * No memory to save the fragment? throw the lot ...
 541         */
 542        tfp = ip_frag_create(offset, end, skb, ptr);
 543        if (!tfp)
 544                goto out_freeskb;
 545
 546        /* Insert this fragment in the chain of fragments. */
 547        tfp->prev = prev;
 548        tfp->next = next;
 549        if (prev != NULL)
 550                prev->next = tfp;
 551        else
 552                qp->fragments = tfp;
 553
 554        if (next != NULL)
 555                next->prev = tfp;
 556
 557        /* OK, so we inserted this new fragment into the chain.
 558         * Check if we now have a full IP datagram which we can
 559         * bump up to the IP layer...
 560         */
 561        if (ip_done(qp)) {
 562                /* Glue together the fragments. */
 563                skb = ip_glue(qp);
 564                /* Free the queue entry. */
 565out_freequeue:
 566                ip_free(qp);
 567out_skb:
 568                return skb;
 569        }
 570
 571        /*
 572         * The queue is still active ... reset its timer.
 573         */
 574out_timer:
 575        mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); /* ~ 30 seconds */
 576out:
 577        return NULL;
 578
 579        /*
 580         * Error exits ... we need to reset the timer if there's a queue.
 581         */
 582out_oversize:
 583        if (net_ratelimit())
 584                printk(KERN_INFO "Oversized packet received from %d.%d.%d.%d\n",
 585                        NIPQUAD(iph->saddr));
 586        /* the skb isn't in a fragment, so fall through to free it */
 587out_freeskb:
 588        kfree_skb(skb);
 589        ip_statistics.IpReasmFails++;
 590        if (qp)
 591                goto out_timer;
 592        goto out;
 593}
 594
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.