linux-old/net/ipv4/tcp_input.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Version:     $Id: tcp_input.c,v 1.159 1999/03/17 19:30:39 davem Exp $
   9 *
  10 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14 *              Florian La Roche, <flla@stud.uni-sb.de>
  15 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18 *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20 *              Jorge Cwik, <jorge@laser.satlink.net>
  21 */
  22
  23/*
  24 * Changes:
  25 *              Pedro Roque     :       Fast Retransmit/Recovery.
  26 *                                      Two receive queues.
  27 *                                      Retransmit queue handled by TCP.
  28 *                                      Better retransmit timer handling.
  29 *                                      New congestion avoidance.
  30 *                                      Header prediction.
  31 *                                      Variable renaming.
  32 *
  33 *              Eric            :       Fast Retransmit.
  34 *              Randy Scott     :       MSS option defines.
  35 *              Eric Schenk     :       Fixes to slow start algorithm.
  36 *              Eric Schenk     :       Yet another double ACK bug.
  37 *              Eric Schenk     :       Delayed ACK bug fixes.
  38 *              Eric Schenk     :       Floyd style fast retrans war avoidance.
  39 *              David S. Miller :       Don't allow zero congestion window.
  40 *              Eric Schenk     :       Fix retransmitter so that it sends
  41 *                                      next packet on ack of previous packet.
  42 *              Andi Kleen      :       Moved open_request checking here
  43 *                                      and process RSTs for open_requests.
  44 *              Andi Kleen      :       Better prune_queue, and other fixes.
  45 *              Andrey Savochkin:       Fix RTT measurements in the presence of
  46 *                                      timestamps.
  47 *              Andrey Savochkin:       Check sequence numbers correctly when
  48 *                                      removing SACKs due to in sequence incoming
  49 *                                      data segments.
  50 *              Andi Kleen:             Make sure we never ack data there is not
  51 *                                      enough room for. Also make this condition
  52 *                                      a fatal error if it might still happen.
  53 *              Andi Kleen:             Add tcp_measure_rcv_mss to make 
  54 *                                      connections with MSS<min(MTU,ann. MSS)
  55 *                                      work without delayed acks. 
  56 *              Andi Kleen:             Process packets with PSH set in the
  57 *                                      fast path.
  58 */
  59
  60#include <linux/config.h>
  61#include <linux/mm.h>
  62#include <linux/sysctl.h>
  63#include <net/tcp.h>
  64#include <linux/ipsec.h>
  65
  66#ifdef CONFIG_SYSCTL
  67#define SYNC_INIT 0 /* let the user enable it */
  68#else
  69#define SYNC_INIT 1
  70#endif
  71
  72extern int sysctl_tcp_fin_timeout;
  73
  74/* These are on by default so the code paths get tested.
  75 * For the final 2.2 this may be undone at our discretion. -DaveM
  76 */
  77int sysctl_tcp_timestamps = 1;
  78int sysctl_tcp_window_scaling = 1;
  79int sysctl_tcp_sack = 1;
  80
  81int sysctl_tcp_syncookies = SYNC_INIT; 
  82int sysctl_tcp_stdurg;
  83int sysctl_tcp_rfc1337;
  84
  85static int prune_queue(struct sock *sk);
  86
  87/* There is something which you must keep in mind when you analyze the
  88 * behavior of the tp->ato delayed ack timeout interval.  When a
  89 * connection starts up, we want to ack as quickly as possible.  The
  90 * problem is that "good" TCP's do slow start at the beginning of data
  91 * transmission.  The means that until we send the first few ACK's the
  92 * sender will sit on his end and only queue most of his data, because
  93 * he can only send snd_cwnd unacked packets at any given time.  For
  94 * each ACK we send, he increments snd_cwnd and transmits more of his
  95 * queue.  -DaveM
  96 */
  97static void tcp_delack_estimator(struct tcp_opt *tp)
  98{
  99        if(tp->ato == 0) {
 100                tp->lrcvtime = jiffies;
 101
 102                /* Help sender leave slow start quickly,
 103                 * and also makes sure we do not take this
 104                 * branch ever again for this connection.
 105                 */
 106                tp->ato = 1;
 107                tcp_enter_quickack_mode(tp);
 108        } else {
 109                int m = jiffies - tp->lrcvtime;
 110
 111                tp->lrcvtime = jiffies;
 112                if(m <= 0)
 113                        m = 1;
 114                if(m > tp->rto)
 115                        tp->ato = tp->rto;
 116                else {
 117                        /* This funny shift makes sure we
 118                         * clear the "quick ack mode" bit.
 119                         */
 120                        tp->ato = ((tp->ato << 1) >> 2) + m;
 121                }
 122        }
 123}
 124
 125/* 
 126 * Remember to send an ACK later.
 127 */
 128static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, 
 129                                        struct sk_buff *skb)
 130{
 131        tp->delayed_acks++; 
 132
 133        /* Tiny-grams with PSH set artifically deflate our
 134         * ato measurement, but with a lower bound.
 135         */
 136        if(th->psh && (skb->len < (tp->mss_cache >> 1))) {
 137                /* Preserve the quickack state. */
 138                if((tp->ato & 0x7fffffff) > HZ/50)
 139                        tp->ato = ((tp->ato & 0x80000000) |
 140                                   (HZ/50));
 141        }
 142} 
 143
 144/* Called to compute a smoothed rtt estimate. The data fed to this
 145 * routine either comes from timestamps, or from segments that were
 146 * known _not_ to have been retransmitted [see Karn/Partridge
 147 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 148 * piece by Van Jacobson.
 149 * NOTE: the next three routines used to be one big routine.
 150 * To save cycles in the RFC 1323 implementation it was better to break
 151 * it up into three procedures. -- erics
 152 */
 153
 154static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
 155{
 156        long m = mrtt; /* RTT */
 157
 158        /*      The following amusing code comes from Jacobson's
 159         *      article in SIGCOMM '88.  Note that rtt and mdev
 160         *      are scaled versions of rtt and mean deviation.
 161         *      This is designed to be as fast as possible 
 162         *      m stands for "measurement".
 163         *
 164         *      On a 1990 paper the rto value is changed to:
 165         *      RTO = rtt + 4 * mdev
 166         */
 167        if(m == 0)
 168                m = 1;
 169        if (tp->srtt != 0) {
 170                m -= (tp->srtt >> 3);   /* m is now error in rtt est */
 171                tp->srtt += m;          /* rtt = 7/8 rtt + 1/8 new */
 172                if (m < 0)
 173                        m = -m;         /* m is now abs(error) */
 174                m -= (tp->mdev >> 2);   /* similar update on mdev */
 175                tp->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
 176        } else {
 177                /* no previous measure. */
 178                tp->srtt = m<<3;        /* take the measured time to be rtt */
 179                tp->mdev = m<<2;        /* make sure rto = 3*rtt */
 180        }
 181}
 182
 183/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 184 * routine referred to above.
 185 */
 186
 187static __inline__ void tcp_set_rto(struct tcp_opt *tp)
 188{
 189        tp->rto = (tp->srtt >> 3) + tp->mdev;
 190        tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
 191}
 192 
 193
 194/* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
 195 * on packet lifetime in the internet. We need the HZ/5 lower
 196 * bound to behave correctly against BSD stacks with a fixed
 197 * delayed ack.
 198 * FIXME: It's not entirely clear this lower bound is the best
 199 * way to avoid the problem. Is it possible to drop the lower
 200 * bound and still avoid trouble with BSD stacks? Perhaps
 201 * some modification to the RTO calculation that takes delayed
 202 * ack bias into account? This needs serious thought. -- erics
 203 */
 204static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
 205{
 206        if (tp->rto > 120*HZ)
 207                tp->rto = 120*HZ;
 208        if (tp->rto < HZ/5)
 209                tp->rto = HZ/5;
 210}
 211
 212/* WARNING: this must not be called if tp->saw_timestamp was false. */
 213extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
 214                                             __u32 start_seq, __u32 end_seq)
 215{
 216        /* From draft-ietf-tcplw-high-performance: the correct
 217         * test is last_ack_sent <= end_seq.
 218         * (RFC1323 stated last_ack_sent < end_seq.)
 219         *
 220         * HOWEVER: The current check contradicts the draft statements.
 221         * It has been done for good reasons.
 222         * The implemented check improves security and eliminates
 223         * unnecessary RTT overestimation.
 224         *              1998/06/27  Andrey V. Savochkin <saw@msu.ru>
 225         */
 226        if (!before(end_seq, tp->last_ack_sent - sk->rcvbuf) &&
 227            !after(start_seq, tp->rcv_wup + tp->rcv_wnd)) {
 228                /* PAWS bug workaround wrt. ACK frames, the PAWS discard
 229                 * extra check below makes sure this can only happen
 230                 * for pure ACK frames.  -DaveM
 231                 */
 232                if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
 233                        tp->ts_recent = tp->rcv_tsval;
 234                        tp->ts_recent_stamp = jiffies;
 235                }
 236        }
 237}
 238
 239#define PAWS_24DAYS     (HZ * 60 * 60 * 24 * 24)
 240
 241extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
 242{
 243        /* ts_recent must be younger than 24 days */
 244        return (((s32)(jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
 245                (((s32)(tp->rcv_tsval - tp->ts_recent) < 0) &&
 246                 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
 247                 (len != (th->doff * 4))));
 248}
 249
 250
 251static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
 252{
 253        u32 end_window = tp->rcv_wup + tp->rcv_wnd;
 254
 255        if (tp->rcv_wnd &&
 256            after(end_seq, tp->rcv_nxt) &&
 257            before(seq, end_window))
 258                return 1;
 259        if (seq != end_window)
 260                return 0;
 261        return (seq == end_seq);
 262}
 263
 264/* This functions checks to see if the tcp header is actually acceptable. */
 265extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
 266{
 267        if (seq == tp->rcv_nxt)
 268                return (tp->rcv_wnd || (end_seq == seq));
 269
 270        return __tcp_sequence(tp, seq, end_seq);
 271}
 272
 273/* When we get a reset we do this. */
 274static void tcp_reset(struct sock *sk)
 275{
 276        sk->zapped = 1;
 277
 278        /* We want the right error as BSD sees it (and indeed as we do). */
 279        switch (sk->state) {
 280                case TCP_SYN_SENT:
 281                        sk->err = ECONNREFUSED;
 282                        break;
 283                case TCP_CLOSE_WAIT:
 284                        sk->err = EPIPE;
 285                        break;
 286                default:
 287                        sk->err = ECONNRESET;
 288        };
 289        tcp_set_state(sk, TCP_CLOSE);
 290        sk->shutdown = SHUTDOWN_MASK;
 291        if (!sk->dead) 
 292                sk->state_change(sk);
 293}
 294
 295/* This tags the retransmission queue when SACKs arrive. */
 296static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
 297{
 298        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 299        int i = nsacks;
 300
 301        while(i--) {
 302                struct sk_buff *skb = skb_peek(&sk->write_queue);
 303                __u32 start_seq = ntohl(sp->start_seq);
 304                __u32 end_seq = ntohl(sp->end_seq);
 305                int fack_count = 0;
 306
 307                while((skb != NULL) &&
 308                      (skb != tp->send_head) &&
 309                      (skb != (struct sk_buff *)&sk->write_queue)) {
 310                        /* The retransmission queue is always in order, so
 311                         * we can short-circuit the walk early.
 312                         */
 313                        if(after(TCP_SKB_CB(skb)->seq, end_seq))
 314                                break;
 315
 316                        /* We play conservative, we don't allow SACKS to partially
 317                         * tag a sequence space.
 318                         */
 319                        fack_count++;
 320                        if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
 321                           !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
 322                                /* If this was a retransmitted frame, account for it. */
 323                                if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
 324                                   tp->retrans_out)
 325                                        tp->retrans_out--;
 326                                TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
 327
 328                                /* RULE: All new SACKs will either decrease retrans_out
 329                                 *       or advance fackets_out.
 330                                 */
 331                                if(fack_count > tp->fackets_out)
 332                                        tp->fackets_out = fack_count;
 333                        }
 334                        skb = skb->next;
 335                }
 336                sp++; /* Move on to the next SACK block. */
 337        }
 338}
 339
 340/* Look for tcp options. Normally only called on SYN and SYNACK packets.
 341 * But, this can also be called on packets in the established flow when
 342 * the fast version below fails.
 343 */
 344void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
 345{
 346        unsigned char *ptr;
 347        int length=(th->doff*4)-sizeof(struct tcphdr);
 348        int saw_mss = 0;
 349
 350        ptr = (unsigned char *)(th + 1);
 351        tp->saw_tstamp = 0;
 352
 353        while(length>0) {
 354                int opcode=*ptr++;
 355                int opsize;
 356
 357                switch (opcode) {
 358                        case TCPOPT_EOL:
 359                                return;
 360                        case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 361                                length--;
 362                                continue;
 363                        default:
 364                                opsize=*ptr++;
 365                                if (opsize < 2) /* "silly options" */
 366                                        return;
 367                                if (opsize > length)
 368                                        break;  /* don't parse partial options */
 369                                switch(opcode) {
 370                                case TCPOPT_MSS:
 371                                        if(opsize==TCPOLEN_MSS && th->syn) {
 372                                                u16 in_mss = ntohs(*(__u16 *)ptr);
 373                                                if (in_mss == 0)
 374                                                        in_mss = 536;
 375                                                if (tp->mss_clamp > in_mss)
 376                                                        tp->mss_clamp = in_mss;
 377                                                saw_mss = 1;
 378                                        }
 379                                        break;
 380                                case TCPOPT_WINDOW:
 381                                        if(opsize==TCPOLEN_WINDOW && th->syn)
 382                                                if (!no_fancy && sysctl_tcp_window_scaling) {
 383                                                        tp->wscale_ok = 1;
 384                                                        tp->snd_wscale = *(__u8 *)ptr;
 385                                                        if(tp->snd_wscale > 14) {
 386                                                                if(net_ratelimit())
 387                                                                        printk("tcp_parse_options: Illegal window "
 388                                                                               "scaling value %d >14 received.",
 389                                                                               tp->snd_wscale);
 390                                                                tp->snd_wscale = 14;
 391                                                        }
 392                                                }
 393                                        break;
 394                                case TCPOPT_TIMESTAMP:
 395                                        if(opsize==TCPOLEN_TIMESTAMP) {
 396                                                if (sysctl_tcp_timestamps && !no_fancy) {
 397                                                        tp->tstamp_ok = 1;
 398                                                        tp->saw_tstamp = 1;
 399                                                        tp->rcv_tsval = ntohl(*(__u32 *)ptr);
 400                                                        tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
 401                                                }
 402                                        }
 403                                        break;
 404                                case TCPOPT_SACK_PERM:
 405                                        if(opsize==TCPOLEN_SACK_PERM && th->syn) {
 406                                                if (sysctl_tcp_sack && !no_fancy) {
 407                                                        tp->sack_ok = 1;
 408                                                        tp->num_sacks = 0;
 409                                                }
 410                                        }
 411                                        break;
 412
 413                                case TCPOPT_SACK:
 414                                        if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
 415                                           sysctl_tcp_sack && (sk != NULL) && !th->syn) {
 416                                                int sack_bytes = opsize - TCPOLEN_SACK_BASE;
 417
 418                                                if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
 419                                                        int num_sacks = sack_bytes >> 3;
 420                                                        struct tcp_sack_block *sackp;
 421
 422                                                        sackp = (struct tcp_sack_block *)ptr;
 423                                                        tcp_sacktag_write_queue(sk, sackp, num_sacks);
 424                                                }
 425                                        }
 426                                };
 427                                ptr+=opsize-2;
 428                                length-=opsize;
 429                };
 430        }
 431        if(th->syn && saw_mss == 0)
 432                tp->mss_clamp = 536;
 433}
 434
 435/* Fast parse options. This hopes to only see timestamps.
 436 * If it is wrong it falls back on tcp_parse_options().
 437 */
 438static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
 439{
 440        /* If we didn't send out any options ignore them all. */
 441        if (tp->tcp_header_len == sizeof(struct tcphdr))
 442                return 0;
 443        if (th->doff == sizeof(struct tcphdr)>>2) {
 444                tp->saw_tstamp = 0;
 445                return 0;
 446        } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
 447                __u32 *ptr = (__u32 *)(th + 1);
 448                if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
 449                                             | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
 450                        tp->saw_tstamp = 1;
 451                        tp->rcv_tsval = ntohl(*++ptr);
 452                        tp->rcv_tsecr = ntohl(*++ptr);
 453                        return 1;
 454                }
 455        }
 456        tcp_parse_options(sk, th, tp, 0);
 457        return 1;
 458}
 459
 460#define FLAG_DATA               0x01 /* Incoming frame contained data.          */
 461#define FLAG_WIN_UPDATE         0x02 /* Incoming ACK was a window update.       */
 462#define FLAG_DATA_ACKED         0x04 /* This ACK acknowledged new data.         */
 463#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted.  */
 464
 465static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
 466{
 467        if (tp->dup_acks > 3)
 468                tp->snd_cwnd = (tp->snd_ssthresh);
 469
 470        tp->dup_acks = 0;
 471}
 472
 473/* NOTE: This code assumes that tp->dup_acks gets cleared when a
 474 * retransmit timer fires.
 475 */
 476static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
 477{
 478        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 479
 480        /* Note: If not_dup is set this implies we got a
 481         * data carrying packet or a window update.
 482         * This carries no new information about possible
 483         * lost packets, so we have to ignore it for the purposes
 484         * of counting duplicate acks. Ideally this does not imply we
 485         * should stop our fast retransmit phase, more acks may come
 486         * later without data to help us. Unfortunately this would make
 487         * the code below much more complex. For now if I see such
 488         * a packet I clear the fast retransmit phase.
 489         */
 490        if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
 491                /* This is the standard reno style fast retransmit branch. */
 492
 493                /* 1. When the third duplicate ack is received, set ssthresh 
 494                 * to one half the current congestion window, but no less 
 495                 * than two segments. Retransmit the missing segment.
 496                 */
 497                if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
 498                        tp->dup_acks++;
 499                        if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
 500                                tp->snd_ssthresh =
 501                                        max(min(tp->snd_wnd, tp->snd_cwnd) >> 1, 2);
 502                                tp->snd_cwnd = (tp->snd_ssthresh + 3);
 503                                tp->high_seq = tp->snd_nxt;
 504                                if(!tp->fackets_out)
 505                                        tcp_retransmit_skb(sk,
 506                                                           skb_peek(&sk->write_queue));
 507                                else
 508                                        tcp_fack_retransmit(sk);
 509                                tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 510                        }
 511                } else if (++tp->dup_acks > 3) {
 512                        /* 2. Each time another duplicate ACK arrives, increment 
 513                         * cwnd by the segment size. [...] Transmit a packet...
 514                         *
 515                         * Packet transmission will be done on normal flow processing
 516                         * since we're not in "retransmit mode".  We do not use
 517                         * duplicate ACKs to artificially inflate the congestion
 518                         * window when doing FACK.
 519                         */
 520                        if(!tp->fackets_out) {
 521                                tp->snd_cwnd++;
 522                        } else {
 523                                /* Fill any further holes which may have
 524                                 * appeared.
 525                                 *
 526                                 * We may want to change this to run every
 527                                 * further multiple-of-3 dup ack increments,
 528                                 * to be more robust against out-of-order
 529                                 * packet delivery.  -DaveM
 530                                 */
 531                                tcp_fack_retransmit(sk);
 532                        }
 533                }
 534        } else if (tp->high_seq != 0) {
 535                /* In this branch we deal with clearing the Floyd style
 536                 * block on duplicate fast retransmits, and if requested
 537                 * we do Hoe style secondary fast retransmits.
 538                 */
 539                if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
 540                        /* Once we have acked all the packets up to high_seq
 541                         * we are done this fast retransmit phase.
 542                         * Alternatively data arrived. In this case we
 543                         * Have to abort the fast retransmit attempt.
 544                         * Note that we do want to accept a window
 545                         * update since this is expected with Hoe's algorithm.
 546                         */
 547                        clear_fast_retransmit(tp);
 548
 549                        /* After we have cleared up to high_seq we can
 550                         * clear the Floyd style block.
 551                         */
 552                        if (!before(ack, tp->high_seq)) {
 553                                tp->high_seq = 0;
 554                                tp->fackets_out = 0;
 555                        }
 556                } else if (tp->dup_acks >= 3) {
 557                        if (!tp->fackets_out) {
 558                                /* Hoe Style. We didn't ack the whole
 559                                 * window. Take this as a cue that
 560                                 * another packet was lost and retransmit it.
 561                                 * Don't muck with the congestion window here.
 562                                 * Note that we have to be careful not to
 563                                 * act if this was a window update and it
 564                                 * didn't ack new data, since this does
 565                                 * not indicate a packet left the system.
 566                                 * We can test this by just checking
 567                                 * if ack changed from snd_una, since
 568                                 * the only way to get here without advancing
 569                                 * from snd_una is if this was a window update.
 570                                 */
 571                                if (ack != tp->snd_una && before(ack, tp->high_seq)) {
 572                                        tcp_retransmit_skb(sk,
 573                                                           skb_peek(&sk->write_queue));
 574                                        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 575                                }
 576                        } else {
 577                                /* FACK style, fill any remaining holes in
 578                                 * receiver's queue.
 579                                 */
 580                                tcp_fack_retransmit(sk);
 581                        }
 582                }
 583        }
 584}
 585
 586/* This is Jacobson's slow start and congestion avoidance. 
 587 * SIGCOMM '88, p. 328.
 588 */
 589static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
 590{
 591        if (tp->snd_cwnd <= tp->snd_ssthresh) {
 592                /* In "safe" area, increase. */
 593                tp->snd_cwnd++;
 594        } else {
 595                /* In dangerous area, increase slowly.
 596                 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
 597                 */
 598                if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
 599                        tp->snd_cwnd++;
 600                        tp->snd_cwnd_cnt=0;
 601                } else
 602                        tp->snd_cwnd_cnt++;
 603        }       
 604}
 605
 606/* Remove acknowledged frames from the retransmission queue. */
 607static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
 608                               __u32 *seq, __u32 *seq_rtt)
 609{
 610        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 611        struct sk_buff *skb;
 612        unsigned long now = jiffies;
 613        int acked = 0;
 614
 615        /* If we are retransmitting, and this ACK clears up to
 616         * the retransmit head, or further, then clear our state.
 617         */
 618        if (tp->retrans_head != NULL &&
 619            !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
 620                tp->retrans_head = NULL;
 621
 622        while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
 623                struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 
 624                __u8 sacked = scb->sacked;
 625                
 626                /* If our packet is before the ack sequence we can
 627                 * discard it as it's confirmed to have arrived at
 628                 * the other end.
 629                 */
 630                if (after(scb->end_seq, ack))
 631                        break;
 632
 633                /* Initial outgoing SYN's get put onto the write_queue
 634                 * just like anything else we transmit.  It is not
 635                 * true data, and if we misinform our callers that
 636                 * this ACK acks real data, we will erroneously exit
 637                 * connection startup slow start one packet too
 638                 * quickly.  This is severely frowned upon behavior.
 639                 */
 640                if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
 641                        tp->retrans_out--;
 642                if(!(scb->flags & TCPCB_FLAG_SYN)) {
 643                        acked |= FLAG_DATA_ACKED;
 644                        if(sacked & TCPCB_SACKED_RETRANS)
 645                                acked |= FLAG_RETRANS_DATA_ACKED;
 646                        if(tp->fackets_out)
 647                                tp->fackets_out--;
 648                } else {
 649                        /* This is pure paranoia. */
 650                        tp->retrans_head = NULL;
 651                }               
 652                tp->packets_out--;
 653                *seq = scb->seq;
 654                *seq_rtt = now - scb->when;
 655                __skb_unlink(skb, skb->list);
 656                kfree_skb(skb);
 657        }
 658        return acked;
 659}
 660
 661static void tcp_ack_probe(struct sock *sk, __u32 ack)
 662{
 663        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 664        
 665        /* Our probe was answered. */
 666        tp->probes_out = 0;
 667        
 668        /* Was it a usable window open? */
 669
 670        /* should always be non-null */
 671        if (tp->send_head != NULL &&
 672            !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
 673                tp->backoff = 0;
 674                tp->pending = 0;
 675                tcp_clear_xmit_timer(sk, TIME_PROBE0);
 676        } else {
 677                tcp_reset_xmit_timer(sk, TIME_PROBE0,
 678                                     min(tp->rto << tp->backoff, 120*HZ));
 679        }
 680}
 681 
 682/* Should we open up the congestion window? */
 683static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
 684{
 685        /* Data must have been acked. */
 686        if ((flag & FLAG_DATA_ACKED) == 0)
 687                return 0;
 688
 689        /* Some of the data acked was retransmitted somehow? */
 690        if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) {
 691                /* We advance in all cases except during
 692                 * non-FACK fast retransmit/recovery.
 693                 */
 694                if (tp->fackets_out != 0 ||
 695                    tp->retransmits != 0)
 696                        return 1;
 697
 698                /* Non-FACK fast retransmit does it's own
 699                 * congestion window management, don't get
 700                 * in the way.
 701                 */
 702                return 0;
 703        }
 704
 705        /* New non-retransmitted data acked, always advance.  */
 706        return 1;
 707}
 708
 709/* Read draft-ietf-tcplw-high-performance before mucking
 710 * with this code. (Superceeds RFC1323)
 711 */
 712static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
 713                               u32 seq, u32 ack, int flag)
 714{
 715        __u32 seq_rtt;
 716
 717        /* RTTM Rule: A TSecr value received in a segment is used to
 718         * update the averaged RTT measurement only if the segment
 719         * acknowledges some new data, i.e., only if it advances the
 720         * left edge of the send window.
 721         *
 722         * See draft-ietf-tcplw-high-performance-00, section 3.3.
 723         * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
 724         */
 725        if (!(flag & FLAG_DATA_ACKED))
 726                return;
 727
 728        seq_rtt = jiffies-tp->rcv_tsecr;
 729        tcp_rtt_estimator(tp, seq_rtt);
 730        if (tp->retransmits) {
 731                if (tp->packets_out == 0) {
 732                        tp->retransmits = 0;
 733                        tp->fackets_out = 0;
 734                        tp->retrans_out = 0;
 735                        tp->backoff = 0;
 736                        tcp_set_rto(tp);
 737                } else {
 738                        /* Still retransmitting, use backoff */
 739                        tcp_set_rto(tp);
 740                        tp->rto = tp->rto << tp->backoff;
 741                }
 742        } else {
 743                tcp_set_rto(tp);
 744        }
 745
 746        tcp_bound_rto(tp);
 747}
 748
 749static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
 750{
 751        struct sk_buff *skb = skb_peek(&sk->write_queue);
 752        long when = tp->rto - (jiffies - TCP_SKB_CB(skb)->when);
 753
 754        /* Some data was ACK'd, if still retransmitting (due to a
 755         * timeout), resend more of the retransmit queue.  The
 756         * congestion window is handled properly by that code.
 757         */
 758        if (tp->retransmits) {
 759                tcp_xmit_retransmit_queue(sk);
 760                tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 761        } else {
 762                tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
 763        }
 764}
 765
 766/* This routine deals with incoming acks, but not outgoing ones.
     *
     * sk      - socket the segment was received for.
     * th      - TCP header of the segment; doff and window are read.
     * ack_seq - compared against and stored into tp->snd_wl1 by the
     *           window update test below.
     * ack     - acknowledgment number being processed.
     * len     - compared with th->doff*4 to decide whether the segment
     *           carries data besides the header.
     *
     * Returns 1 if the socket is zapped or the ACK was processed, and 0
     * if the ACK lies after snd_nxt or before snd_una and was ignored.
     */
 767static int tcp_ack(struct sock *sk, struct tcphdr *th, 
 768                   u32 ack_seq, u32 ack, int len)
 769{
 770        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 771        int flag = 0;
 772        u32 seq = 0;
 773        u32 seq_rtt = 0;
 774
 775        if(sk->zapped)
 776                return(1);      /* Dead, can't ack any more so why bother */
 777
            /* An ACK arriving while the keepopen timer is pending resets
             * the probe counter.
             */
 778        if (tp->pending == TIME_KEEPOPEN)
 779                tp->probes_out = 0;
 780
            /* Remember when we last received an ACK. */
 781        tp->rcv_tstamp = jiffies;
 782
 783        /* If the ack is newer than sent or older than previous acks
 784         * then we can probably ignore it.
 785         */
 786        if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
 787                goto uninteresting_ack;
 788
 789        dst_confirm(sk->dst_cache);
 790
 791        /* Segment is longer than its TCP header: it carries data. */
 792        if (len != th->doff*4) {
 793                flag |= FLAG_DATA;
 794                tcp_delack_estimator(tp);
 795        }
 796
 797        /* Update our send window. */
 798
 799        /* This is the window update code as per RFC 793
 800         * snd_wl{1,2} are used to prevent unordered
 801         * segments from shrinking the window 
 802         */
 803        if (before(tp->snd_wl1, ack_seq) ||
 804            (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
 805                u32 nwin = ntohs(th->window) << tp->snd_wscale;
 806
                    /* Only count it as an update if the ACK number moved
                     * or the window actually grew.
                     */
 807                if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
 808                        flag |= FLAG_WIN_UPDATE;
 809                        tp->snd_wnd = nwin;
 810
 811                        tp->snd_wl1 = ack_seq;
 812                        tp->snd_wl2 = ack;
 813
 814                        if (nwin > tp->max_window)
 815                                tp->max_window = nwin;
 816                }
 817        }
 818
 819        /* We passed data and got it acked, remove any soft error
 820         * log. Something worked...
 821         */
 822        sk->err_soft = 0;
 823
 824        /* If this ack opens up a zero window, clear backoff.  It was
 825         * being used to time the probes, and is probably far higher than
 826         * it needs to be for normal retransmission.
 827         */
 828        if (tp->pending == TIME_PROBE0)
 829                tcp_ack_probe(sk, ack);
 830
 831        /* See if we can take anything off of the retransmit queue. */
 832        flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
 833
 834        /* We must do this here, before code below clears out important
 835         * state contained in tp->fackets_out and tp->retransmits.  -DaveM
 836         */
 837        if (should_advance_cwnd(tp, flag))
 838                tcp_cong_avoid(tp);
 839
 840        /* If we have a timestamp, we always do rtt estimates. */
 841        if (tp->saw_tstamp) {
 842                tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
 843        } else {
 844                /* If we were retransmitting don't count rtt estimate. */
 845                if (tp->retransmits) {
                            /* Nothing left in flight: leave retransmit state. */
 846                        if (tp->packets_out == 0) {
 847                                tp->retransmits = 0;
 848                                tp->fackets_out = 0;
 849                                tp->retrans_out = 0;
 850                        }
 851                } else {
 852                        /* We don't have a timestamp. Can only use
 853                         * packets that are not retransmitted to determine
 854                         * rtt estimates. Also, we must not reset the
 855                         * backoff for rto until we get a non-retransmitted
 856                         * packet. This allows us to deal with a situation
 857                         * where the network delay has increased suddenly.
 858                         * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
 859                         */
 860                        if (flag & FLAG_DATA_ACKED) {
 861                                if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
 862                                        tp->backoff = 0;
 863                                        tcp_rtt_estimator(tp, seq_rtt);
 864                                        tcp_set_rto(tp);
 865                                        tcp_bound_rto(tp);
 866                                }
 867                        }
 868                }
 869        }
 870
            /* Re-arm the retransmit timer if frames remain outstanding and
             * this ACK removed some; stop it when nothing is in flight.
             */
 871        if (tp->packets_out) {
 872                if (flag & FLAG_DATA_ACKED)
 873                        tcp_ack_packets_out(sk, tp);
 874        } else {
 875                tcp_clear_xmit_timer(sk, TIME_RETRANS);
 876        }
 877
            /* Keep only the data and window-update bits.  The first test
             * below fires only for an ACK equal to snd_una that carries
             * neither, while packets are still outstanding; a non-zero
             * high_seq also routes the ACK to tcp_fast_retrans().
             */
 878        flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
 879        if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
 880            (tp->high_seq != 0)) {
 881                tcp_fast_retrans(sk, ack, flag);
 882        } else {
 883                /* Clear any aborted fast retransmit starts. */
 884                tp->dup_acks = 0;
 885        }
 886        /* Remember the highest ack received. */
 887        tp->snd_una = ack;
 888        return 1;
 889
 890uninteresting_ack:
 891        SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
 892        return 0;
 893}
 894
 895/* New-style handling of TIME_WAIT sockets. */
 896extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
 897extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
 898extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
 899
 900void tcp_timewait_kill(struct tcp_tw_bucket *tw)
 901{
 902        /* Unlink from various places. */
 903        if(tw->bind_next)
 904                tw->bind_next->bind_pprev = tw->bind_pprev;
 905        *(tw->bind_pprev) = tw->bind_next;
 906        if(tw->tb->owners == NULL)
 907                tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
 908
 909        if(tw->next)
 910                tw->next->pprev = tw->pprev;
 911        *tw->pprev = tw->next;
 912
 913        /* We decremented the prot->inuse count when we entered TIME_WAIT
 914         * and the sock from which this came was destroyed.
 915         */
 916        tw->sklist_next->sklist_prev = tw->sklist_prev;
 917        tw->sklist_prev->sklist_next = tw->sklist_next;
 918
 919        /* Ok, now free it up. */
 920        kmem_cache_free(tcp_timewait_cachep, tw);
 921}
 922
 923/* We come here as a special case from the AF specific TCP input processing,
 924 * and the SKB has no owner.  Essentially handling this is very simple,
 925 * we just keep silently eating rx'd packets until none show up for the
 926 * entire timeout period.  The only special cases are for BSD TIME_WAIT
 927 * reconnects and SYN/RST bits being set in the TCP header.
 928 */
 929int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
 930                               struct tcphdr *th, unsigned len)
 931{
 932        /*      RFC 1122:
 933         *      "When a connection is [...] on TIME-WAIT state [...]
 934         *      [a TCP] MAY accept a new SYN from the remote TCP to
 935         *      reopen the connection directly, if it:
 936         *      
 937         *      (1)  assigns its initial sequence number for the new
 938         *      connection to be larger than the largest sequence
 939         *      number it used on the previous connection incarnation,
 940         *      and
 941         *
 942         *      (2)  returns to TIME-WAIT state if the SYN turns out 
 943         *      to be an old duplicate".
 944         */
 945        if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
 946                struct sock *sk;
 947                struct tcp_func *af_specific = tw->af_specific;
 948                __u32 isn;
 949
 950                isn = tw->rcv_nxt + 128000;
 951                if(isn == 0)
 952                        isn++;
 953                tcp_tw_deschedule(tw);
 954                tcp_timewait_kill(tw);
 955                sk = af_specific->get_sock(skb, th);
 956                if(sk == NULL ||
 957                   !ipsec_sk_policy(sk,skb) ||
 958                   atomic_read(&sk->sock_readers) != 0)
 959                        return 0;
 960                skb_set_owner_r(skb, sk);
 961                af_specific = sk->tp_pinfo.af_tcp.af_specific;
 962                if(af_specific->conn_request(sk, skb, isn) < 0)
 963                        return 1; /* Toss a reset back. */
 964                return 0; /* Discard the frame. */
 965        }
 966
 967        /* Check RST or SYN */
 968        if(th->rst || th->syn) {
 969                /* This is TIME_WAIT assasination, in two flavors.
 970                 * Oh well... nobody has a sufficient solution to this
 971                 * protocol bug yet.
 972                 */
 973                if(sysctl_tcp_rfc1337 == 0) {
 974                        tcp_tw_deschedule(tw);
 975                        tcp_timewait_kill(tw);
 976                }
 977                if(!th->rst)
 978                        return 1; /* toss a reset back */
 979        } else {
 980                /* In this case we must reset the TIMEWAIT timer. */
 981                if(th->ack)
 982                        tcp_tw_reschedule(tw);
 983        }
 984        return 0; /* Discard the frame. */
 985}
 986
 987/* Enter the time wait state.  This is always called from BH
 988 * context.  Essentially we whip up a timewait bucket, copy the
 989 * relevant info into it from the SK, and mess with hash chains
 990 * and list linkage.
 991 */
 992static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
 993{
 994        struct sock **head, *sktw;
 995
 996        /* Step 1: Remove SK from established hash. */
 997        if(sk->next)
 998                sk->next->pprev = sk->pprev;
 999        *sk->pprev = sk->next;
1000        sk->pprev = NULL;
1001        tcp_reg_zap(sk);
1002
1003        /* Step 2: Put TW into bind hash where SK was. */
1004        tw->tb = (struct tcp_bind_bucket *)sk->prev;
1005        if((tw->bind_next = sk->bind_next) != NULL)
1006                sk->bind_next->bind_pprev = &tw->bind_next;
1007        tw->bind_pprev = sk->bind_pprev;
1008        *sk->bind_pprev = (struct sock *)tw;
1009
1010        /* Step 3: Same for the protocol sklist. */
1011        (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
1012        (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
1013        sk->sklist_next = NULL;
1014        sk->prot->inuse--;
1015
1016        /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
1017        head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)];
1018        sktw = (struct sock *)tw;
1019        if((sktw->next = *head) != NULL)
1020                (*head)->pprev = &sktw->next;
1021        *head = sktw;
1022        sktw->pprev = head;
1023}
1024
1025void tcp_time_wait(struct sock *sk)
1026{
1027        struct tcp_tw_bucket *tw;
1028
1029        tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
1030        if(tw != NULL) {
1031                /* Give us an identity. */
1032                tw->daddr       = sk->daddr;
1033                tw->rcv_saddr   = sk->rcv_saddr;
1034                tw->bound_dev_if= sk->bound_dev_if;
1035                tw->num         = sk->num;
1036                tw->state       = TCP_TIME_WAIT;
1037                tw->sport       = sk->sport;
1038                tw->dport       = sk->dport;
1039                tw->family      = sk->family;
1040                tw->reuse       = sk->reuse;
1041                tw->rcv_nxt     = sk->tp_pinfo.af_tcp.rcv_nxt;
1042                tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
1043
1044#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1045                if(tw->family == PF_INET6) {
1046                        memcpy(&tw->v6_daddr,
1047                               &sk->net_pinfo.af_inet6.daddr,
1048                               sizeof(struct in6_addr));
1049                        memcpy(&tw->v6_rcv_saddr,
1050                               &sk->net_pinfo.af_inet6.rcv_saddr,
1051                               sizeof(struct in6_addr));
1052                }
1053#endif
1054                /* Linkage updates. */
1055                tcp_tw_hashdance(sk, tw);
1056
1057                /* Get the TIME_WAIT timeout firing. */
1058                tcp_tw_schedule(tw);
1059
1060                /* CLOSE the SK. */
1061                if(sk->state == TCP_ESTABLISHED)
1062                        tcp_statistics.TcpCurrEstab--;
1063                sk->state = TCP_CLOSE;
1064                net_reset_timer(sk, TIME_DONE,
1065                                min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
1066        } else {
1067                /* Sorry, we're out of memory, just CLOSE this
1068                 * socket up.  We've got bigger problems than
1069                 * non-graceful socket closings.
1070                 */
1071                tcp_set_state(sk, TCP_CLOSE);
1072        }
1073
1074        /* Prevent rcvmsg/sndmsg calls, and wake people up. */
1075        sk->shutdown = SHUTDOWN_MASK;
1076        if(!sk->dead)
1077                sk->state_change(sk);
1078}
1079
1080/*
1081 *      Process the FIN bit. This now behaves as it is supposed to work
1082 *      and the FIN takes effect when it is validly part of sequence
1083 *      space. Not before when we get holes.
1084 *
1085 *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1086 *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
1087 *      TIME-WAIT)
1088 *
1089 *      If we are in FINWAIT-1, a received FIN indicates simultaneous
1090 *      close and we go into CLOSING (and later onto TIME-WAIT)
1091 *
1092 *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1093 */
1094 
1095static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1096{
1097        sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
1098
1099        tcp_send_ack(sk);
1100
1101        if (!sk->dead) {
1102                sk->state_change(sk);
1103                sock_wake_async(sk->socket, 1);
1104        }
1105
1106        switch(sk->state) {
1107                case TCP_SYN_RECV:
1108                case TCP_ESTABLISHED:
1109                        /* Move to CLOSE_WAIT */
1110                        tcp_set_state(sk, TCP_CLOSE_WAIT);
1111                        if (th->rst)
1112                                sk->shutdown = SHUTDOWN_MASK;
1113                        break;
1114
1115                case TCP_CLOSE_WAIT:
1116                case TCP_CLOSING:
1117                        /* Received a retransmission of the FIN, do
1118                         * nothing.
1119                         */
1120                        break;
1121                case TCP_LAST_ACK:
1122                        /* RFC793: Remain in the LAST-ACK state. */
1123                        break;
1124
1125                case TCP_FIN_WAIT1:
1126                        /* This case occurs when a simultaneous close
1127                         * happens, we must ack the received FIN and
1128                         * enter the CLOSING state.
1129                         *
1130                         * This causes a WRITE timeout, which will either
1131                         * move on to TIME_WAIT when we timeout, or resend
1132                         * the FIN properly (maybe we get rid of that annoying
1133                         * FIN lost hang). The TIME_WRITE code is already 
1134                         * correct for handling this timeout.
1135                         */
1136                        tcp_set_state(sk, TCP_CLOSING);
1137                        break;
1138                case TCP_FIN_WAIT2:
1139                        /* Received a FIN -- send ACK and enter TIME_WAIT. */
1140                        tcp_time_wait(sk);
1141                        break;
1142                default:
1143                        /* Only TCP_LISTEN and TCP_CLOSE are left, in these
1144                         * cases we should never reach this piece of code.
1145                         */
1146                        printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
1147                        break;
1148        };
1149}
1150
1151/* These routines update the SACK block as out-of-order packets arrive or
1152 * in-order packets close up the sequence space.
1153 */
1154static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
1155{
1156        int this_sack, num_sacks = tp->num_sacks;
1157        struct tcp_sack_block *swalk = &tp->selective_acks[0];
1158
1159        /* If more than one SACK block, see if the recent change to SP eats into
1160         * or hits the sequence space of other SACK blocks, if so coalesce.
1161         */
1162        if(num_sacks != 1) {
1163                for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
1164                        if(swalk == sp)
1165                                continue;
1166
1167                        /* First case, bottom of SP moves into top of the
1168                         * sequence space of SWALK.
1169                         */
1170                        if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
1171                                sp->start_seq = swalk->start_seq;
1172                                goto coalesce;
1173                        }
1174                        /* Second case, top of SP moves into bottom of the
1175                         * sequence space of SWALK.
1176                         */
1177                        if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
1178                                sp->end_seq = swalk->end_seq;
1179                                goto coalesce;
1180                        }
1181                }
1182        }
1183        /* SP is the only SACK, or no coalescing cases found. */
1184        return;
1185
1186coalesce:
1187        /* Zap SWALK, by moving every further SACK up by one slot.
1188         * Decrease num_sacks.
1189         */
1190        for(; this_sack < num_sacks-1; this_sack++, swalk++) {
1191                struct tcp_sack_block *next = (swalk + 1);
1192                swalk->start_seq = next->start_seq;
1193                swalk->end_seq = next->end_seq;
1194        }
1195        tp->num_sacks--;
1196}
1197
1198static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
1199{
1200        __u32 tmp;
1201
1202        tmp = sack1->start_seq;
1203        sack1->start_seq = sack2->start_seq;
1204        sack2->start_seq = tmp;
1205
1206        tmp = sack1->end_seq;
1207        sack1->end_seq = sack2->end_seq;
1208        sack2->end_seq = tmp;
1209}
1210
1211static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
1212{
1213        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1214        struct tcp_sack_block *sp = &tp->selective_acks[0];
1215        int cur_sacks = tp->num_sacks;
1216
1217        if (!cur_sacks)
1218                goto new_sack;
1219
1220        /* Optimize for the common case, new ofo frames arrive
1221         * "in order". ;-)  This also satisfies the requirements
1222         * of RFC2018 about ordering of SACKs.
1223         */
1224        if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
1225                sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1226                tcp_sack_maybe_coalesce(tp, sp);
1227        } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
1228                /* Re-ordered arrival, in this case, can be optimized
1229                 * as well.
1230                 */
1231                sp->start_seq = TCP_SKB_CB(skb)->seq;
1232                tcp_sack_maybe_coalesce(tp, sp);
1233        } else {
1234                struct tcp_sack_block *swap = sp + 1;
1235                int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
1236
1237                /* Oh well, we have to move things around.
1238                 * Try to find a SACK we can tack this onto.
1239                 */
1240
1241                for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
1242                        if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
1243                           (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
1244                                if(swap->end_seq == TCP_SKB_CB(skb)->seq)
1245                                        swap->end_seq = TCP_SKB_CB(skb)->end_seq;
1246                                else
1247                                        swap->start_seq = TCP_SKB_CB(skb)->seq;
1248                                tcp_sack_swap(sp, swap);
1249                                tcp_sack_maybe_coalesce(tp, sp);
1250                                return;
1251                        }
1252                }
1253
1254                /* Could not find an adjacent existing SACK, build a new one,
1255                 * put it at the front, and shift everyone else down.  We
1256                 * always know there is at least one SACK present already here.
1257                 *
1258                 * If the sack array is full, forget about the last one.
1259                 */
1260                if (cur_sacks >= max_sacks) {
1261                        cur_sacks--;
1262                        tp->num_sacks--;
1263                }
1264                while(cur_sacks >= 1) {
1265                        struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
1266                        struct tcp_sack_block *prev = (this - 1);
1267                        this->start_seq = prev->start_seq;
1268                        this->end_seq = prev->end_seq;
1269                        cur_sacks--;
1270                }
1271
1272        new_sack:
1273                /* Build the new head SACK, and we're done. */
1274                sp->start_seq = TCP_SKB_CB(skb)->seq;
1275                sp->end_seq = TCP_SKB_CB(skb)->end_seq;
1276                tp->num_sacks++;
1277        }
1278}
1279
1280static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
1281{
1282        struct tcp_sack_block *sp = &tp->selective_acks[0];
1283        int num_sacks = tp->num_sacks;
1284        int this_sack;
1285
1286        /* This is an in order data segment _or_ an out-of-order SKB being
1287         * moved to the receive queue, so we know this removed SKB will eat
1288         * from the front of a SACK.
1289         */
1290        for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1291                /* Check if the start of the sack is covered by skb. */
1292                if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
1293                   before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
1294                        break;
1295        }
1296
1297        /* This should only happen if so many SACKs get built that some get
1298         * pushed out before we get here, or we eat some in sequence packets
1299         * which are before the first SACK block.
1300         */
1301        if(this_sack >= num_sacks)
1302                return;
1303
1304        sp->start_seq = TCP_SKB_CB(skb)->end_seq;
1305        if(!before(sp->start_seq, sp->end_seq)) {
1306                /* Zap this SACK, by moving forward any other SACKS. */
1307                for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
1308                        struct tcp_sack_block *next = (sp + 1);
1309                        sp->start_seq = next->start_seq;
1310                        sp->end_seq = next->end_seq;
1311                }
1312                tp->num_sacks--;
1313        }
1314}
1315
1316static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
1317{
1318        struct tcp_sack_block *sp = &tp->selective_acks[0];
1319        int num_sacks = tp->num_sacks;
1320        int this_sack;
1321
1322        for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
1323                if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
1324                        break;
1325        }
1326        if(this_sack >= num_sacks)
1327                return;
1328        sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
1329}
1330
1331/* This one checks to see if we can put data from the
1332 * out_of_order queue into the receive_queue.
1333 */
1334static void tcp_ofo_queue(struct sock *sk)
1335{
1336        struct sk_buff *skb;
1337        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1338
1339        while ((skb = skb_peek(&tp->out_of_order_queue))) {
1340                if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
1341                        break;
1342
1343                if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1344                        SOCK_DEBUG(sk, "ofo packet was already received \n");
1345                        __skb_unlink(skb, skb->list);
1346                        kfree_skb(skb);
1347                        continue;
1348                }
1349                SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
1350                           tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1351                           TCP_SKB_CB(skb)->end_seq);
1352
1353                if(tp->sack_ok)
1354                        tcp_sack_remove_skb(tp, skb);
1355                __skb_unlink(skb, skb->list);
1356                __skb_queue_tail(&sk->receive_queue, skb);
1357                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1358                if(skb->h.th->fin)
1359                        tcp_fin(skb, sk, skb->h.th);
1360        }
1361}
1362
1363static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
1364{
1365        struct sk_buff *skb1;
1366        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1367
1368        /*  Queue data for delivery to the user.
1369         *  Packets in sequence go to the receive queue.
1370         *  Out of sequence packets to the out_of_order_queue.
1371         */
1372        if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1373                /* Ok. In sequence. */
1374        queue_and_out:
1375                dst_confirm(sk->dst_cache);
1376                __skb_queue_tail(&sk->receive_queue, skb);
1377                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1378                if(skb->h.th->fin) {
1379                        tcp_fin(skb, sk, skb->h.th);
1380                } else {
1381                        tcp_remember_ack(tp, skb->h.th, skb); 
1382                }
1383                /* This may have eaten into a SACK block. */
1384                if(tp->sack_ok && tp->num_sacks)
1385                        tcp_sack_remove_skb(tp, skb);
1386                tcp_ofo_queue(sk);
1387
1388                /* Turn on fast path. */ 
1389                if (skb_queue_len(&tp->out_of_order_queue) == 0)
1390                        tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
1391                                               (0x10 << 16) |
1392                                               tp->snd_wnd);
1393                return;
1394        }
1395        
1396        /* An old packet, either a retransmit or some packet got lost. */
1397        if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
1398                /* A retransmit, 2nd most common case.  Force an imediate ack. */
1399                SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
1400                tcp_enter_quickack_mode(tp);
1401                kfree_skb(skb);
1402                return;
1403        }
1404
1405        if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1406                /* Partial packet, seq < rcv_next < end_seq */
1407                SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
1408                           tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
1409                           TCP_SKB_CB(skb)->end_seq);
1410
1411                goto queue_and_out;
1412        }
1413
1414        /* Ok. This is an out_of_order segment, force an ack. */
1415        tp->delayed_acks++;
1416        tcp_enter_quickack_mode(tp);
1417
1418        /* Disable header prediction. */
1419        tp->pred_flags = 0;
1420
1421        SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
1422                   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1423
1424        if (skb_peek(&tp->out_of_order_queue) == NULL) {
1425                /* Initial out of order segment, build 1 SACK. */
1426                if(tp->sack_ok) {
1427                        tp->num_sacks = 1;
1428                        tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
1429                        tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
1430                }
1431                __skb_queue_head(&tp->out_of_order_queue,skb);
1432        } else {
1433                for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
1434                        /* Already there. */
1435                        if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
1436                                if (skb->len >= skb1->len) {
1437                                        if(tp->sack_ok)
1438                                                tcp_sack_extend(tp, skb1, skb);
1439                                        __skb_append(skb1, skb);
1440                                        __skb_unlink(skb1, skb1->list);
1441                                        kfree_skb(skb1);
1442                                } else {
1443                                        /* A duplicate, smaller than what is in the
1444                                         * out-of-order queue right now, toss it.
1445                                         */
1446                                        kfree_skb(skb);
1447                                }
1448                                break;
1449                        }
1450                        
1451                        if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
1452                                __skb_append(skb1, skb);
1453                                if(tp->sack_ok)
1454                                        tcp_sack_new_ofo_skb(sk, skb);
1455                                break;
1456                        }
1457
1458                        /* See if we've hit the start. If so insert. */
1459                        if (skb1 == skb_peek(&tp->out_of_order_queue)) {
1460                                __skb_queue_head(&tp->out_of_order_queue,skb);
1461                                if(tp->sack_ok)
1462                                        tcp_sack_new_ofo_skb(sk, skb);
1463                                break;
1464                        }
1465                }
1466        }
1467}
1468
1469
1470/*
1471 *      This routine handles the data.  If there is room in the buffer,
1472 *      it will have already been moved into it.  If there is no
1473 *      room, then we will just have to discard the packet.
1474 */
1475
1476static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
1477{
1478        struct tcphdr *th;
1479        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1480
1481        th = skb->h.th;
1482        skb_pull(skb, th->doff*4);
1483        skb_trim(skb, len - (th->doff*4));
1484
1485        if (skb->len == 0 && !th->fin)
1486                return(0);
1487
1488        /* 
1489         *      If our receive queue has grown past its limits shrink it.
1490         *      Make sure to do this before moving snd_nxt, otherwise
1491         *      data might be acked for that we don't have enough room.
1492         */
1493        if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { 
1494                if (prune_queue(sk) < 0) { 
1495                        /* Still not enough room. That can happen when
1496                         * skb->true_size differs significantly from skb->len.
1497                         */
1498                        return 0;
1499                }
1500        }
1501
1502        tcp_data_queue(sk, skb);
1503
1504        if (before(tp->rcv_nxt, tp->copied_seq)) {
1505                printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
1506                tp->rcv_nxt = tp->copied_seq;
1507        }
1508
1509        /* Above, tcp_data_queue() increments delayed_acks appropriately.
1510         * Now tell the user we may have some data.
1511         */
1512        if (!sk->dead) {
1513                SOCK_DEBUG(sk, "Data wakeup.\n");
1514                sk->data_ready(sk,0);
1515        }
1516        return(1);
1517}
1518
1519static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
1520{
1521        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1522
1523        if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
1524            tcp_packets_in_flight(tp) < tp->snd_cwnd) {
1525                /* Put more data onto the wire. */
1526                tcp_write_xmit(sk);
1527        } else if (tp->packets_out == 0 && !tp->pending) {
1528                /* Start probing the receivers window. */
1529                tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
1530        }
1531}
1532
1533static __inline__ void tcp_data_snd_check(struct sock *sk)
1534{
1535        struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
1536
1537        if (skb != NULL)
1538                __tcp_data_snd_check(sk, skb); 
1539}
1540
1541/* 
1542 * Adapt the MSS value used to make delayed ack decision to the 
1543 * real world. 
1544 */ 
1545static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
1546{
1547        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1548        unsigned int len = skb->len, lss; 
1549
1550        if (len > tp->rcv_mss) 
1551                tp->rcv_mss = len; 
1552        lss = tp->last_seg_size; 
1553        tp->last_seg_size = 0; 
1554        if (len >= 536) {
1555                if (len == lss) 
1556                        tp->rcv_mss = len; 
1557                tp->last_seg_size = len; 
1558        }
1559}
1560
1561/*
1562 * Check if sending an ack is needed.
1563 */
1564static __inline__ void __tcp_ack_snd_check(struct sock *sk)
1565{
1566        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1567
1568        /* This also takes care of updating the window.
1569         * This if statement needs to be simplified.
1570         *
1571         * Rules for delaying an ack:
1572         *      - delay time <= 0.5 HZ
1573         *      - we don't have a window update to send
1574         *      - must send at least every 2 full sized packets
1575         *      - must send an ACK if we have any out of order data
1576         *
1577         * With an extra heuristic to handle loss of packet
1578         * situations and also helping the sender leave slow
1579         * start in an expediant manner.
1580         */
1581
1582            /* Two full frames received or... */
1583        if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
1584            /* We will update the window "significantly" or... */
1585            tcp_raise_window(sk) ||
1586            /* We entered "quick ACK" mode or... */
1587            tcp_in_quickack_mode(tp) ||
1588            /* We have out of order data */
1589            (skb_peek(&tp->out_of_order_queue) != NULL)) {
1590                /* Then ack it now */
1591                tcp_send_ack(sk);
1592        } else {
1593                /* Else, send delayed ack. */
1594                tcp_send_delayed_ack(tp, HZ/2);
1595        }
1596}
1597
1598static __inline__ void tcp_ack_snd_check(struct sock *sk)
1599{
1600        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1601        if (tp->delayed_acks == 0) {
1602                /* We sent a data segment already. */
1603                return;
1604        }
1605        __tcp_ack_snd_check(sk);
1606}
1607
1608
1609/*
1610 *      This routine is only called when we have urgent data
1611 *      signalled. It's the 'slow' part of tcp_urg. It could be
1612 *      moved inline now as tcp_urg is only called from one
1613 *      place. We handle URGent data wrong. We have to - as
1614 *      BSD still doesn't use the correction from RFC961.
1615 *      For 1003.1g we should support a new option TCP_STDURG to permit
1616 *      either form (or just set the sysctl tcp_stdurg).
1617 */
1618 
1619static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1620{
1621        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1622        u32 ptr = ntohs(th->urg_ptr);
1623
1624        if (ptr && !sysctl_tcp_stdurg)
1625                ptr--;
1626        ptr += ntohl(th->seq);
1627
1628        /* Ignore urgent data that we've already seen and read. */
1629        if (after(tp->copied_seq, ptr))
1630                return;
1631
1632        /* Do we already have a newer (or duplicate) urgent pointer? */
1633        if (tp->urg_data && !after(ptr, tp->urg_seq))
1634                return;
1635
1636        /* Tell the world about our new urgent pointer. */
1637        if (sk->proc != 0) {
1638                if (sk->proc > 0)
1639                        kill_proc(sk->proc, SIGURG, 1);
1640                else
1641                        kill_pg(-sk->proc, SIGURG, 1);
1642        }
1643
1644        /* We may be adding urgent data when the last byte read was
1645         * urgent. To do this requires some care. We cannot just ignore
1646         * tp->copied_seq since we would read the last urgent byte again
1647         * as data, nor can we alter copied_seq until this data arrives
1648         * or we break the sematics of SIOCATMARK (and thus sockatmark())
1649         */
1650        if (tp->urg_seq == tp->copied_seq)
1651                tp->copied_seq++;       /* Move the copied sequence on correctly */
1652        tp->urg_data = URG_NOTYET;
1653        tp->urg_seq = ptr;
1654
1655        /* Disable header prediction. */
1656        tp->pred_flags = 0;
1657}
1658
1659/* This is the 'fast' part of urgent handling. */
1660static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1661{
1662        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1663
1664        /* Check if we get a new urgent pointer - normally not. */
1665        if (th->urg)
1666                tcp_check_urg(sk,th);
1667
1668        /* Do we wait for any urgent data? - normally not... */
1669        if (tp->urg_data == URG_NOTYET) {
1670                u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
1671
1672                /* Is the urgent pointer pointing into this packet? */   
1673                if (ptr < len) {
1674                        tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1675                        if (!sk->dead)
1676                                sk->data_ready(sk,0);
1677                }
1678        }
1679}
1680
1681/* Clean the out_of_order queue if we can, trying to get
1682 * the socket within its memory limits again.
1683 *
1684 * Return less than zero if we should start dropping frames
1685 * until the socket owning process reads some of the data
1686 * to stabilize the situation.
1687 */
1688static int prune_queue(struct sock *sk)
1689{
1690        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; 
1691        struct sk_buff * skb;
1692
1693        SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
1694
1695        net_statistics.PruneCalled++; 
1696
1697        /* First, purge the out_of_order queue. */
1698        skb = __skb_dequeue_tail(&tp->out_of_order_queue);
1699        if(skb != NULL) {
1700                /* Free it all. */
1701                do {    net_statistics.OfoPruned += skb->len; 
1702                        kfree_skb(skb);
1703                        skb = __skb_dequeue_tail(&tp->out_of_order_queue);
1704                } while(skb != NULL);
1705
1706                /* Reset SACK state.  A conforming SACK implementation will
1707                 * do the same at a timeout based retransmit.  When a connection
1708                 * is in a sad state like this, we care only about integrity
1709                 * of the connection not performance.
1710                 */
1711                if(tp->sack_ok)
1712                        tp->num_sacks = 0;
1713        }
1714        
1715        /* If we are really being abused, tell the caller to silently
1716         * drop receive data on the floor.  It will get retransmitted
1717         * and hopefully then we'll have sufficient space.
1718         *
1719         * We used to try to purge the in-order packets too, but that
1720         * turns out to be deadly and fraught with races.  Consider:
1721         *
1722         * 1) If we acked the data, we absolutely cannot drop the
1723         *    packet.  This data would then never be retransmitted.
1724         * 2) It is possible, with a proper sequence of events involving
1725         *    delayed acks and backlog queue handling, to have the user
1726         *    read the data before it gets acked.  The previous code
1727         *    here got this wrong, and it lead to data corruption.
1728         * 3) Too much state changes happen when the FIN arrives, so once
1729         *    we've seen that we can't remove any in-order data safely.
1730         *
1731         * The net result is that removing in-order receive data is too
1732         * complex for anyones sanity.  So we don't do it anymore.  But
1733         * if we are really having our buffer space abused we stop accepting
1734         * new receive data.
1735         */
1736        if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
1737                return 0;
1738
1739        /* Massive buffer overcommit. */
1740        return -1;
1741}
1742
1743/*
1744 *      TCP receive function for the ESTABLISHED state. 
1745 *
1746 *      It is split into a fast path and a slow path. The fast path is 
1747 *      disabled when:
1748 *      - A zero window was announced from us - zero window probing
1749 *        is only handled properly in the slow path. 
1750 *      - Out of order segments arrived.
1751 *      - Urgent data is expected.
1752 *      - There is no buffer space left
1753 *      - Unexpected TCP flags/window values/header lengths are received
1754 *        (detected by checking the TCP header against pred_flags) 
1755 *      - Data is sent in both directions. Fast path only supports pure senders
1756 *        or pure receivers (this means either the sequence number or the ack
1757 *        value must stay constant)
1758 *
1759 *      When these conditions are not satisfied it drops into a standard 
1760 *      receive procedure patterned after RFC793 to handle all cases.
1761 *      The first three cases are guaranteed by proper pred_flags setting,
1762 *      the rest is checked inline. Fast processing is turned on in 
1763 *      tcp_data_queue when everything is OK.
1764 */
1765int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
1766                        struct tcphdr *th, unsigned len)
1767{
            /* Process one segment of len bytes (TCP header included) whose
             * header is th.
             *
             * Returns 1 only on the "SYN in established state" path, after
             * tcp_reset(); the skb is not freed here in that case.  Every
             * other path returns 0 with the skb either freed or handed to
             * the queueing code.
             */
1768        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1769        int queued;
1770        u32 flg;
1771
1772        /*
1773         *      Header prediction.
1774         *      The code follows the one in the famous 
1775         *      "30 instruction TCP receive" Van Jacobson mail.
1776         *      
1777         *      Van's trick is to deposit buffers into socket queue 
1778         *      on a device interrupt, to call tcp_recv function
1779         *      on the receive process context and checksum and copy
1780         *      the buffer to user space. smart...
1781         *
1782         *      Our current scheme is not silly either but we take the 
1783         *      extra cost of the net_bh soft interrupt processing...
1784         *      We do checksum and copy also but from device to kernel.
1785         */
1786
1787        /*
1788         * RFC1323: H1. Apply PAWS check first.
1789         */
1790        if (tcp_fast_parse_options(sk, th, tp)) {
1791                if (tp->saw_tstamp) {
1792                        if (tcp_paws_discard(tp, th, len)) {
1793                                tcp_statistics.TcpInErrs++;
                                    /* A segment failing PAWS is acked and
                                     * dropped, unless it carries RST, in
                                     * which case processing continues.
                                     */
1794                                if (!th->rst) {
1795                                        tcp_send_ack(sk);
1796                                        goto discard;
1797                                }
1798                        }
1799                        tcp_replace_ts_recent(sk, tp,
1800                                              TCP_SKB_CB(skb)->seq,
1801                                              TCP_SKB_CB(skb)->end_seq);
1802                }
1803        }
1804
            /* Fourth 32-bit word of the TCP header, with the 0x8 flag bit
             * (PSH, see below) masked out so it cannot defeat prediction.
             */
1805        flg = *(((u32 *)th) + 3) & ~htonl(0x8 << 16);
1806
1807        /*      pred_flags is 0xS?10 << 16 + snd_wnd
1808         *      if header_prediction is to be made
1809         *      'S' will always be tp->tcp_header_len >> 2
1810         *      '?' will be 0 else it will be !0
1811         *      (when there are holes in the receive 
1812         *       space for instance)
1813         *      PSH flag is ignored.
1814         */
1815
1816        if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
1817                if (len <= th->doff*4) {
1818                        /* Bulk data transfer: sender */
                            /* len == header length: a pure ACK, no payload. */
1819                        if (len == th->doff*4) {
1820                                tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
1821                                        TCP_SKB_CB(skb)->ack_seq, len); 
1822                                kfree_skb(skb); 
1823                                tcp_data_snd_check(sk);
1824                                return 0;
1825                        } else { /* Header too small */
1826                                tcp_statistics.TcpInErrs++;
1827                                goto discard;
1828                        }
1829                } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
1830                           atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
1831                        /* Bulk data transfer: receiver */
                            /* Taken only when the ACK field acknowledges
                             * nothing new and receive memory is within
                             * sk->rcvbuf; otherwise fall to the slow path.
                             */
1832                        __skb_pull(skb,th->doff*4);
1833
1834                        tcp_measure_rcv_mss(sk, skb); 
1835
1836                        /* DO NOT notify forward progress here.
1837                         * It saves dozen of CPU instructions in fast path. --ANK
1838                         */
1839                        __skb_queue_tail(&sk->receive_queue, skb);
1840                        tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1841
1842                        /* FIN bit check is not done since if FIN is set in
1843                         * this frame, the pred_flags won't match up. -DaveM
1844                         */
1845                        sk->data_ready(sk, 0);
1846                        tcp_delack_estimator(tp);
1847
1848                        tcp_remember_ack(tp, th, skb); 
1849
1850                        __tcp_ack_snd_check(sk);
1851                        return 0;
1852                }
1853        }
1854
1855        /*
1856         *      Standard slow path.
1857         */
1858
1859        if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
1860                /* RFC793, page 37: "In all states except SYN-SENT, all reset
1861                 * (RST) segments are validated by checking their SEQ-fields."
1862                 * And page 69: "If an incoming segment is not acceptable,
1863                 * an acknowledgment should be sent in reply (unless the RST bit
1864                 * is set, if so drop the segment and return)".
1865                 */
1866                if (th->rst)
1867                        goto discard;
1868                if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
1869                        SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
1870                                   TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
1871                                   tp->rcv_wup, tp->rcv_wnd);
1872                }
1873                tcp_send_ack(sk);
1874                goto discard;
1875        }
1876
1877        if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
1878                SOCK_DEBUG(sk, "syn in established state\n");
1879                tcp_statistics.TcpInErrs++;
1880                tcp_reset(sk);
1881                return 1;
1882        }
1883        
1884        if(th->rst) {
1885                tcp_reset(sk);
1886                goto discard;
1887        }
1888
1889        if(th->ack)
1890                tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
1891        
1892        /* Process urgent data. */
1893        tcp_urg(sk, th, len);
1894
1895        /* step 7: process the segment text */
1896        queued = tcp_data(skb, sk, len);
1897
1898        /* This must be after tcp_data() does the skb_pull() to
1899         * remove the header size from skb->len.
1900         *
1901         * Dave!!! Phrase above (and all about rcv_mss) has 
1902         * nothing to do with reality. rcv_mss must measure TOTAL
1903         * size, including sacks, IP options etc. Hence, measure_rcv_mss
1904         * must occur before pulling etc, otherwise it will flap
1905         * like hell. Even putting it before tcp_data is wrong,
1906         * it should use skb->tail - skb->nh.raw instead.
1907         *                                      --ANK (980805)
1908         * 
1909         * BTW I broke it. Now all TCP options are handled equally
1910         * in mss_clamp calculations (i.e. ignored, rfc1122),
1911         * and mss_cache does include all of them (i.e. tstamps)
1912         * except for sacks, to calculate effective mss faster.
1913         *                                      --ANK (980805)
1914         */
            /* NOTE(review): tcp_data() returns 1 even when tcp_data_queue()
             * has called kfree_skb(skb) (retransmit, or a smaller duplicate
             * of a queued out-of-order segment).  The skb->len read inside
             * tcp_measure_rcv_mss() may then touch a released buffer --
             * confirm the skb lifetime on those paths.
             */
1915        tcp_measure_rcv_mss(sk, skb); 
1916
1917        /* Be careful, tcp_data() may have put this into TIME_WAIT. */
1918        if(sk->state != TCP_CLOSE) {
1919                tcp_data_snd_check(sk);
1920                tcp_ack_snd_check(sk);
1921        }
1922
            /* The discard label sits inside the if body, so "goto discard"
             * frees the skb without looking at queued (which is not yet
             * initialized on those paths).
             */
1923        if (!queued) {
1924        discard:
1925                kfree_skb(skb);
1926        }
1927
1928        return 0;
1929}
1930
1931/* 
1932 *      Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented
1933 *      as an open_request. 
1934 */
1935
1936struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 
1937                           struct open_request *req)
1938{
1939        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1940        u32 flg;
1941
1942        /*      assumption: the socket is not in use.
1943         *      as we checked the user count on tcp_rcv and we're
1944         *      running from a soft interrupt.
1945         */
1946
1947        /* Check for syn retransmission */
1948        flg = *(((u32 *)skb->h.th) + 3);
1949        
1950        flg &= __constant_htonl(0x00170000);
1951        /* Only SYN set? */
1952        if (flg == __constant_htonl(0x00020000)) {
1953                if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) {
1954                        /*      retransmited syn.
1955                         */
1956                        req->class->rtx_syn_ack(sk, req); 
1957                        return NULL;
1958                } else {
1959                        return sk; /* Pass new SYN to the listen socket. */
1960                }
1961        }
1962
1963        /* We know it's an ACK here */  
1964        if (req->sk) {
1965                /*      socket already created but not
1966                 *      yet accepted()...
1967                 */
1968                sk = req->sk;
1969        } else {
1970                /* In theory the packet could be for a cookie, but
1971                 * TIME_WAIT should guard us against this. 
1972                 * XXX: Nevertheless check for cookies?
1973                 * This sequence number check is done again later,
1974                 * but we do it here to prevent syn flood attackers
1975                 * from creating big SYN_RECV sockets.
1976                 */ 
1977                if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
1978                    !between(TCP_SKB_CB(skb)->seq, req->rcv_isn, 
1979                             req->rcv_isn+1+req->rcv_wnd)) {
1980                        req->class->send_reset(skb);
1981                        return NULL;
1982                }
1983        
1984                sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
1985                tcp_dec_slow_timer(TCP_SLT_SYNACK);
1986                if (sk == NULL)
1987                        return NULL;
1988                
1989                req->expires = 0UL;
1990                req->sk = sk;
1991        }
1992        skb_orphan(skb); 
1993        skb_set_owner_r(skb, sk);
1994        return sk; 
1995}
1996
1997/*
1998 *      This function implements the receiving procedure of RFC 793 for
1999 *      all states except ESTABLISHED and TIME_WAIT. 
2000 *      It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
2001 *      address independent.
2002 */
2003        
2004int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
2005                          struct tcphdr *th, unsigned len)
2006{
2007        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2008        int queued = 0;
2009
2010        switch (sk->state) {
2011        case TCP_CLOSE:
2012                /* When state == CLOSED, hash lookup always fails.
2013                 *
2014                 * But, there is a back door, the backlog queue.
2015                 * If we have a sequence of packets in the backlog
2016                 * during __release_sock() which have a sequence such
2017                 * that:
2018                 *      packet X        causes entry to TCP_CLOSE state
2019                 *      ...
2020                 *      packet X + N    has FIN bit set
2021                 *
2022                 * We report a (luckily) harmless error in this case.
2023                 * The issue is that backlog queue processing bypasses
2024                 * any hash lookups (we know which socket packets are for).
2025                 * The correct behavior here is what 2.0.x did, since
2026                 * a TCP_CLOSE socket does not exist.  Drop the frame
2027                 * and send a RST back to the other end.
2028                 */
2029                return 1;
2030
2031        case TCP_LISTEN:
2032                /* These use the socket TOS.. 
2033                 * might want to be the received TOS 
2034                 */
2035                if(th->ack)
2036                        return 1;
2037                
2038                if(th->syn) {
2039                        if(tp->af_specific->conn_request(sk, skb, 0) < 0)
2040                                return 1;
2041
2042                        /* Now we have several options: In theory there is 
2043                         * nothing else in the frame. KA9Q has an option to 
2044                         * send data with the syn, BSD accepts data with the
2045                         * syn up to the [to be] advertised window and 
2046                         * Solaris 2.1 gives you a protocol error. For now 
2047                         * we just ignore it, that fits the spec precisely 
2048                         * and avoids incompatibilities. It would be nice in
2049                         * future to drop through and process the data.
2050                         *
2051                         * Now that TTCP is starting to be used we ought to 
2052                         * queue this data.
2053                         * But, this leaves one open to an easy denial of
2054                         * service attack, and SYN cookies can't defend
2055                         * against this problem. So, we drop the data
2056                         * in the interest of security over speed.
2057                         */
2058                        goto discard;
2059                }
2060                
2061                goto discard;
2062                break;
2063
2064        case TCP_SYN_SENT:
2065                /* SYN sent means we have to look for a suitable ack and 
2066                 * either reset for bad matches or go to connected. 
2067                 * The SYN_SENT case is unusual and should
2068                 * not be in line code. [AC]
2069                 */
2070                if(th->ack) {
2071                        tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2072
2073                        /* We got an ack, but it's not a good ack. */
2074                        if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
2075                                    TCP_SKB_CB(skb)->ack_seq, len)) 
2076                                return 1;
2077
2078                        if(th->rst) {
2079                                tcp_reset(sk);
2080                                goto discard;
2081                        }
2082
2083                        if(!th->syn) 
2084                                goto discard;
2085
2086                        /* Ok.. it's good. Set up sequence numbers and
2087                         * move to established.
2088                         */
2089                        tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
2090                        tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
2091
2092                        /* RFC1323: The window in SYN & SYN/ACK segments is
2093                         * never scaled.
2094                         */
2095                        tp->snd_wnd = htons(th->window);
2096                        tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2097                        tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2098                        tp->fin_seq = TCP_SKB_CB(skb)->seq;
2099
2100                        tcp_set_state(sk, TCP_ESTABLISHED);
2101                        tcp_parse_options(sk, th, tp, 0);
2102
2103                        if (tp->wscale_ok == 0) {
2104                                tp->snd_wscale = tp->rcv_wscale = 0;
2105                                tp->window_clamp = min(tp->window_clamp,65535);
2106                        }
2107
2108                        if (tp->tstamp_ok) {
2109                                tp->tcp_header_len =
2110                                        sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
2111                        } else
2112                                tp->tcp_header_len = sizeof(struct tcphdr);
2113                        if (tp->saw_tstamp) {
2114                                tp->ts_recent = tp->rcv_tsval;
2115                                tp->ts_recent_stamp = jiffies;
2116                        }
2117
2118                        /* Can't be earlier, doff would be wrong. */
2119                        tcp_send_ack(sk);
2120
2121                        sk->dport = th->source;
2122                        tp->copied_seq = tp->rcv_nxt;
2123
2124                        if(!sk->dead) {
2125                                sk->state_change(sk);
2126                                sock_wake_async(sk->socket, 0);
2127                        }
2128                } else {
2129                        if(th->syn && !th->rst) {
2130                                /* The previous version of the code
2131                                 * checked for "connecting to self"
2132                                 * here. that check is done now in
2133                                 * tcp_connect.
2134                                 */
2135                                tcp_set_state(sk, TCP_SYN_RECV);
2136                                tcp_parse_options(sk, th, tp, 0);
2137                                if (tp->saw_tstamp) {
2138                                        tp->ts_recent = tp->rcv_tsval;
2139                                        tp->ts_recent_stamp = jiffies;
2140                                }
2141                                
2142                                tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
2143                                tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
2144
2145                                /* RFC1323: The window in SYN & SYN/ACK segments is
2146                                 * never scaled.
2147                                 */
2148                                tp->snd_wnd = htons(th->window);
2149                                tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2150                                
2151                                tcp_send_synack(sk);
2152                        } else
2153                                break; 
2154                }
2155
2156                /* tp->tcp_header_len and tp->mss_clamp
2157                   probably changed, synchronize mss.
2158                   */
2159                tcp_sync_mss(sk, tp->pmtu_cookie);
2160                tp->rcv_mss = tp->mss_cache;
2161
2162                if (sk->state == TCP_SYN_RECV)
2163                        goto discard;
2164                
2165                goto step6; 
2166        }
2167
2168        /*   Parse the tcp_options present on this header.
2169         *   By this point we really only expect timestamps.
2170         *   Note that this really has to be here and not later for PAWS
2171         *   (RFC1323) to work.
2172         */
2173        if (tcp_fast_parse_options(sk, th, tp)) {
2174                /* NOTE: assumes saw_tstamp is never set if we didn't
2175                 * negotiate the option. tcp_fast_parse_options() must
2176                 * guarantee this.
2177                 */
2178                if (tp->saw_tstamp) {
2179                        if (tcp_paws_discard(tp, th, len)) {
2180                                tcp_statistics.TcpInErrs++;
2181                                if (!th->rst) {
2182                                        tcp_send_ack(sk);
2183                                        goto discard;
2184                                }
2185                        }
2186                        tcp_replace_ts_recent(sk, tp,
2187                                              TCP_SKB_CB(skb)->seq,
2188                                              TCP_SKB_CB(skb)->end_seq);
2189                }
2190        }
2191
2192        /* step 1: check sequence number */
2193        if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
2194                if (!th->rst) {
2195                        tcp_send_ack(sk);
2196                        goto discard;
2197                }
2198        }
2199
2200        /* step 2: check RST bit */
2201        if(th->rst) {
2202                tcp_reset(sk);
2203                goto discard;
2204        }
2205
2206        /* step 3: check security and precedence [ignored] */
2207
2208        /*      step 4:
2209         *
2210         *      Check for a SYN, and ensure it matches the SYN we first
2211         *      received. We have to handle the rather unusual (but valid)
2212         *      sequence that KA9Q derived products may generate of
2213         *
2214         *      SYN
2215         *                              SYN|ACK Data
2216         *      ACK     (lost)
2217         *                              SYN|ACK Data + More Data
2218         *      .. we must ACK not RST...
2219         *
2220         *      We keep syn_seq as the sequence space occupied by the 
2221         *      original syn. 
2222         */
2223
2224        if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
2225                tcp_reset(sk);
2226                return 1;
2227        }
2228
2229        /* step 5: check the ACK field */
2230        if (th->ack) {
2231                int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
2232                                         TCP_SKB_CB(skb)->ack_seq, len);
2233                
2234                switch(sk->state) {
2235                case TCP_SYN_RECV:
2236                        if (acceptable) {
2237                                tcp_set_state(sk, TCP_ESTABLISHED);
2238                                sk->dport = th->source;
2239                                tp->copied_seq = tp->rcv_nxt;
2240
2241                                if(!sk->dead)
2242                                        sk->state_change(sk);           
2243
2244                                tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
2245                                tp->snd_wnd = htons(th->window) << tp->snd_wscale;
2246                                tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
2247                                tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
2248
2249                        } else {
2250                                SOCK_DEBUG(sk, "bad ack\n");
2251                                return 1;
2252                        }
2253                        break;
2254
2255                case TCP_FIN_WAIT1:
2256                        if (tp->snd_una == tp->write_seq) {
2257                                sk->shutdown |= SEND_SHUTDOWN;
2258                                tcp_set_state(sk, TCP_FIN_WAIT2);
2259                                if (!sk->dead)
2260                                        sk->state_change(sk);
2261                                else
2262                                        tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
2263                        }
2264                        break;
2265
2266                case TCP_CLOSING:       
2267                        if (tp->snd_una == tp->write_seq) {
2268                                tcp_time_wait(sk);
2269                                goto discard;
2270                        }
2271                        break;
2272
2273                case TCP_LAST_ACK:
2274                        if (tp->snd_una == tp->write_seq) {
2275                                sk->shutdown = SHUTDOWN_MASK;
2276                                tcp_set_state(sk,TCP_CLOSE);
2277                                if (!sk->dead)
2278                                        sk->state_change(sk);
2279                                goto discard;
2280                        }
2281                        break;
2282                }
2283        } else
2284                goto discard;
2285
2286step6:
2287        /* step 6: check the URG bit */
2288        tcp_urg(sk, th, len);
2289
2290        /* step 7: process the segment text */
2291        switch (sk->state) {
2292        case TCP_CLOSE_WAIT:
2293        case TCP_CLOSING:
2294                if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
2295                        break;
2296        
2297        case TCP_FIN_WAIT1:
2298        case TCP_FIN_WAIT2:
2299                /* RFC 793 says to queue data in these states,
2300                 * RFC 1122 says we MUST send a reset. 
2301                 * BSD 4.4 also does reset.
2302                 */
2303                if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
2304                        if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
2305                                tcp_reset(sk);
2306                                return 1;
2307                        }
2308                }
2309                
2310        case TCP_ESTABLISHED: 
2311                queued = tcp_data(skb, sk, len);
2312
2313                /* This must be after tcp_data() does the skb_pull() to
2314                 * remove the header size from skb->len.
2315                 */
2316                tcp_measure_rcv_mss(sk, skb); 
2317                break;
2318        }
2319
2320        tcp_data_snd_check(sk);
2321        tcp_ack_snd_check(sk);
2322
2323        if (!queued) { 
2324discard:
2325                kfree_skb(skb);
2326        }
2327        return 0;
2328}
2329
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.