linux-bk/net/ipv4/tcp_output.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Version:     $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
   9 *
  10 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14 *              Florian La Roche, <flla@stud.uni-sb.de>
  15 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18 *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20 *              Jorge Cwik, <jorge@laser.satlink.net>
  21 */
  22
  23/*
  24 * Changes:     Pedro Roque     :       Retransmit queue handled by TCP.
  25 *                              :       Fragmentation on mtu decrease
  26 *                              :       Segment collapse on retransmit
  27 *                              :       AF independence
  28 *
  29 *              Linus Torvalds  :       send_delayed_ack
  30 *              David S. Miller :       Charge memory using the right skb
  31 *                                      during syn/ack processing.
  32 *              David S. Miller :       Output engine completely rewritten.
  33 *              Andrea Arcangeli:       SYNACK carry ts_recent in tsecr.
  34 *              Cacophonix Gaul :       draft-minshall-nagle-01
  35 *              J Hadi Salim    :       ECN support
  36 *
  37 */
  38
  39#include <net/tcp.h>
  40
  41#include <linux/compiler.h>
  42#include <linux/module.h>
  43#include <linux/smp_lock.h>
  44
   45/* Enabled (1) by default; people can turn this off for buggy TCPs found in printers etc.  NOTE(review): judging by the name this gates segment collapsing on retransmit -- confirm at the point of use. */
   46int sysctl_tcp_retrans_collapse = 1;
  47
  48static __inline__
  49void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
  50{
  51        tp->send_head = skb->next;
  52        if (tp->send_head == (struct sk_buff *)&sk->sk_write_queue)
  53                tp->send_head = NULL;
  54        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
  55        if (tp->packets_out++ == 0)
  56                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
  57}
  58
  59/* SND.NXT, if window was not shrunk.
  60 * If window has been shrunk, what should we make? It is not clear at all.
  61 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
  62 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
  63 * invalid. OK, let's make this for now:
  64 */
  65static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
  66{
  67        if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
  68                return tp->snd_nxt;
  69        else
  70                return tp->snd_una+tp->snd_wnd;
  71}
  72
  73/* Calculate mss to advertise in SYN segment.
  74 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
  75 *
  76 * 1. It is independent of path mtu.
  77 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
  78 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
  79 *    attached devices, because some buggy hosts are confused by
  80 *    large MSS.
  81 * 4. We do not make 3, we advertise MSS, calculated from first
  82 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
  83 *    This may be overridden via information stored in routing table.
  84 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
  85 *    probably even Jumbo".
  86 */
  87static __u16 tcp_advertise_mss(struct sock *sk)
  88{
  89        struct tcp_opt *tp = tcp_sk(sk);
  90        struct dst_entry *dst = __sk_dst_get(sk);
  91        int mss = tp->advmss;
  92
  93        if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
  94                mss = dst_metric(dst, RTAX_ADVMSS);
  95                tp->advmss = mss;
  96        }
  97
  98        return (__u16)mss;
  99}
 100
 101/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
 102 * This is the first part of cwnd validation mechanism. */
 103static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
 104{
 105        s32 delta = tcp_time_stamp - tp->lsndtime;
 106        u32 restart_cwnd = tcp_init_cwnd(tp, dst);
 107        u32 cwnd = tp->snd_cwnd;
 108
 109        tp->snd_ssthresh = tcp_current_ssthresh(tp);
 110        restart_cwnd = min(restart_cwnd, cwnd);
 111
 112        while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
 113                cwnd >>= 1;
 114        tp->snd_cwnd = max(cwnd, restart_cwnd);
 115        tp->snd_cwnd_stamp = tcp_time_stamp;
 116        tp->snd_cwnd_used = 0;
 117}
 118
 119static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb, struct sock *sk)
 120{
 121        u32 now = tcp_time_stamp;
 122
 123        if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
 124                tcp_cwnd_restart(tp, __sk_dst_get(sk));
 125
 126        tp->lsndtime = now;
 127
 128        /* If it is a reply for ato after last received
 129         * packet, enter pingpong mode.
 130         */
 131        if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
 132                tp->ack.pingpong = 1;
 133}
 134
 135static __inline__ void tcp_event_ack_sent(struct sock *sk)
 136{
 137        struct tcp_opt *tp = tcp_sk(sk);
 138
 139        tcp_dec_quickack_mode(tp);
 140        tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
 141}
 142
 143/* Chose a new window to advertise, update state in tcp_opt for the
 144 * socket, and return result with RFC1323 scaling applied.  The return
 145 * value can be stuffed directly into th->window for an outgoing
 146 * frame.
 147 */
 148static __inline__ u16 tcp_select_window(struct sock *sk)
 149{
 150        struct tcp_opt *tp = tcp_sk(sk);
 151        u32 cur_win = tcp_receive_window(tp);
 152        u32 new_win = __tcp_select_window(sk);
 153
 154        /* Never shrink the offered window */
 155        if(new_win < cur_win) {
 156                /* Danger Will Robinson!
 157                 * Don't update rcv_wup/rcv_wnd here or else
 158                 * we will not be able to advertise a zero
 159                 * window in time.  --DaveM
 160                 *
 161                 * Relax Will Robinson.
 162                 */
 163                new_win = cur_win;
 164        }
 165        tp->rcv_wnd = new_win;
 166        tp->rcv_wup = tp->rcv_nxt;
 167
 168        /* RFC1323 scaling applied */
 169        new_win >>= tp->rcv_wscale;
 170
 171        /* If we advertise zero window, disable fast path. */
 172        if (new_win == 0)
 173                tp->pred_flags = 0;
 174
 175        return new_win;
 176}
 177
 178
 179/* This routine actually transmits TCP packets queued in by
 180 * tcp_do_sendmsg().  This is used by both the initial
 181 * transmission and possible later retransmissions.
 182 * All SKB's seen here are completely headerless.  It is our
 183 * job to build the TCP header, and pass the packet down to
 184 * IP so it can do the same plus pass the packet off to the
 185 * device.
 186 *
 187 * We are working here with either a clone of the original
 188 * SKB, or a fresh unique copy made by the retransmit engine.
 189 */
 190int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 191{
 192        if(skb != NULL) {
 193                struct inet_opt *inet = inet_sk(sk);
 194                struct tcp_opt *tp = tcp_sk(sk);
 195                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 196                int tcp_header_size = tp->tcp_header_len;
 197                struct tcphdr *th;
 198                int sysctl_flags;
 199                int err;
 200
 201#define SYSCTL_FLAG_TSTAMPS     0x1
 202#define SYSCTL_FLAG_WSCALE      0x2
 203#define SYSCTL_FLAG_SACK        0x4
 204
 205                sysctl_flags = 0;
 206                if (tcb->flags & TCPCB_FLAG_SYN) {
 207                        tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
 208                        if(sysctl_tcp_timestamps) {
 209                                tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
 210                                sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
 211                        }
 212                        if(sysctl_tcp_window_scaling) {
 213                                tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
 214                                sysctl_flags |= SYSCTL_FLAG_WSCALE;
 215                        }
 216                        if(sysctl_tcp_sack) {
 217                                sysctl_flags |= SYSCTL_FLAG_SACK;
 218                                if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
 219                                        tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
 220                        }
 221                } else if (tp->eff_sacks) {
 222                        /* A SACK is 2 pad bytes, a 2 byte header, plus
 223                         * 2 32-bit sequence numbers for each SACK block.
 224                         */
 225                        tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
 226                                            (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
 227                }
 228                th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 229                skb->h.th = th;
 230                skb_set_owner_w(skb, sk);
 231
 232                /* Build TCP header and checksum it. */
 233                th->source              = inet->sport;
 234                th->dest                = inet->dport;
 235                th->seq                 = htonl(tcb->seq);
 236                th->ack_seq             = htonl(tp->rcv_nxt);
 237                *(((__u16 *)th) + 6)    = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
 238                if (tcb->flags & TCPCB_FLAG_SYN) {
 239                        /* RFC1323: The window in SYN & SYN/ACK segments
 240                         * is never scaled.
 241                         */
 242                        th->window      = htons(tp->rcv_wnd);
 243                } else {
 244                        th->window      = htons(tcp_select_window(sk));
 245                }
 246                th->check               = 0;
 247                th->urg_ptr             = 0;
 248
 249                if (tp->urg_mode &&
 250                    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
 251                        th->urg_ptr             = htons(tp->snd_up-tcb->seq);
 252                        th->urg                 = 1;
 253                }
 254
 255                if (tcb->flags & TCPCB_FLAG_SYN) {
 256                        tcp_syn_build_options((__u32 *)(th + 1),
 257                                              tcp_advertise_mss(sk),
 258                                              (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
 259                                              (sysctl_flags & SYSCTL_FLAG_SACK),
 260                                              (sysctl_flags & SYSCTL_FLAG_WSCALE),
 261                                              tp->rcv_wscale,
 262                                              tcb->when,
 263                                              tp->ts_recent);
 264                } else {
 265                        tcp_build_and_update_options((__u32 *)(th + 1),
 266                                                     tp, tcb->when);
 267
 268                        TCP_ECN_send(sk, tp, skb, tcp_header_size);
 269                }
 270                tp->af_specific->send_check(sk, th, skb->len, skb);
 271
 272                if (tcb->flags & TCPCB_FLAG_ACK)
 273                        tcp_event_ack_sent(sk);
 274
 275                if (skb->len != tcp_header_size)
 276                        tcp_event_data_sent(tp, skb, sk);
 277
 278                TCP_INC_STATS(TcpOutSegs);
 279
 280                err = tp->af_specific->queue_xmit(skb, 0);
 281                if (err <= 0)
 282                        return err;
 283
 284                tcp_enter_cwr(tp);
 285
 286                /* NET_XMIT_CN is special. It does not guarantee,
 287                 * that this packet is lost. It tells that device
 288                 * is about to start to drop packets or already
 289                 * drops some packets of the same priority and
 290                 * invokes us to send less aggressively.
 291                 */
 292                return err == NET_XMIT_CN ? 0 : err;
 293        }
 294        return -ENOBUFS;
 295#undef SYSCTL_FLAG_TSTAMPS
 296#undef SYSCTL_FLAG_WSCALE
 297#undef SYSCTL_FLAG_SACK
 298}
 299
 300
 301/* This is the main buffer sending routine. We queue the buffer
 302 * and decide whether to queue or transmit now.
 303 *
 304 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 305 * otherwise socket can stall.
 306 */
 307void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
 308{
 309        struct tcp_opt *tp = tcp_sk(sk);
 310
 311        /* Advance write_seq and place onto the write_queue. */
 312        tp->write_seq = TCP_SKB_CB(skb)->end_seq;
 313        __skb_queue_tail(&sk->sk_write_queue, skb);
 314        tcp_charge_skb(sk, skb);
 315
 316        if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
 317                /* Send it out now. */
 318                TCP_SKB_CB(skb)->when = tcp_time_stamp;
 319                if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
 320                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 321                        tcp_minshall_update(tp, cur_mss, skb);
 322                        if (tp->packets_out++ == 0)
 323                                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
 324                        return;
 325                }
 326        }
 327        /* Queue it, remembering where we must start sending. */
 328        if (tp->send_head == NULL)
 329                tp->send_head = skb;
 330}
 331
 332/* Send _single_ skb sitting at the send head. This function requires
 333 * true push pending frames to setup probe timer etc.
 334 */
 335void tcp_push_one(struct sock *sk, unsigned cur_mss)
 336{
 337        struct tcp_opt *tp = tcp_sk(sk);
 338        struct sk_buff *skb = tp->send_head;
 339
 340        if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
 341                /* Send it out now. */
 342                TCP_SKB_CB(skb)->when = tcp_time_stamp;
 343                if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
 344                        tp->send_head = NULL;
 345                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 346                        if (tp->packets_out++ == 0)
 347                                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
 348                        return;
 349                }
 350        }
 351}
 352
 353/* Split fragmented skb to two parts at length len. */
 354
 355static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
 356{
 357        int i;
 358        int pos = skb_headlen(skb);
 359
 360        if (len < pos) {
 361                /* Split line is inside header. */
 362                memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len);
 363
 364                /* And move data appendix as is. */
 365                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 366                        skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
 367
 368                skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
 369                skb_shinfo(skb)->nr_frags = 0;
 370
 371                skb1->data_len = skb->data_len;
 372                skb1->len += skb1->data_len;
 373                skb->data_len = 0;
 374                skb->len = len;
 375                skb->tail = skb->data+len;
 376        } else {
 377                int k = 0;
 378                int nfrags = skb_shinfo(skb)->nr_frags;
 379
 380                /* Second chunk has no header, nothing to copy. */
 381
 382                skb_shinfo(skb)->nr_frags = 0;
 383                skb1->len = skb1->data_len = skb->len - len;
 384                skb->len = len;
 385                skb->data_len = len - pos;
 386
 387                for (i=0; i<nfrags; i++) {
 388                        int size = skb_shinfo(skb)->frags[i].size;
 389                        if (pos + size > len) {
 390                                skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
 391
 392                                if (pos < len) {
 393                                        /* Split frag.
 394                                         * We have to variants in this case:
 395                                         * 1. Move all the frag to the second
 396                                         *    part, if it is possible. F.e.
 397                                         *    this approach is mandatory for TUX,
 398                                         *    where splitting is expensive.
 399                                         * 2. Split is accurately. We make this.
 400                                         */
 401                                        get_page(skb_shinfo(skb)->frags[i].page);
 402                                        skb_shinfo(skb1)->frags[0].page_offset += (len-pos);
 403                                        skb_shinfo(skb1)->frags[0].size -= (len-pos);
 404                                        skb_shinfo(skb)->frags[i].size = len-pos;
 405                                        skb_shinfo(skb)->nr_frags++;
 406                                }
 407                                k++;
 408                        } else {
 409                                skb_shinfo(skb)->nr_frags++;
 410                        }
 411                        pos += size;
 412                }
 413                skb_shinfo(skb1)->nr_frags = k;
 414        }
 415}
 416
 417/* Function to create two new TCP segments.  Shrinks the given segment
 418 * to the specified size and appends a new segment with the rest of the
 419 * packet to the list.  This won't be called frequently, I hope. 
 420 * Remember, these are still headerless SKBs at this point.
 421 */
 422static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 423{
 424        struct tcp_opt *tp = tcp_sk(sk);
 425        struct sk_buff *buff;
 426        int nsize = skb->len - len;
 427        u16 flags;
 428
 429        if (skb_cloned(skb) &&
 430            skb_is_nonlinear(skb) &&
 431            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 432                return -ENOMEM;
 433
 434        /* Get a new skb... force flag on. */
 435        buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
 436        if (buff == NULL)
 437                return -ENOMEM; /* We'll just try again later. */
 438        tcp_charge_skb(sk, buff);
 439
 440        /* Correct the sequence numbers. */
 441        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
 442        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
 443        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
 444
 445        /* PSH and FIN should only be set in the second packet. */
 446        flags = TCP_SKB_CB(skb)->flags;
 447        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
 448        TCP_SKB_CB(buff)->flags = flags;
 449        TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
 450        if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
 451                tp->lost_out++;
 452                tp->left_out++;
 453        }
 454        TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
 455
 456        if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
 457                /* Copy and checksum data tail into the new buffer. */
 458                buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
 459                                                       nsize, 0);
 460
 461                skb_trim(skb, len);
 462
 463                skb->csum = csum_block_sub(skb->csum, buff->csum, len);
 464        } else {
 465                skb->ip_summed = CHECKSUM_HW;
 466                skb_split(skb, buff, len);
 467        }
 468
 469        buff->ip_summed = skb->ip_summed;
 470
 471        /* Looks stupid, but our code really uses when of
 472         * skbs, which it never sent before. --ANK
 473         */
 474        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
 475
 476        /* Link BUFF into the send queue. */
 477        __skb_append(skb, buff);
 478
 479        return 0;
 480}
 481
 482/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 483 * eventually). The difference is that pulled data not copied, but
 484 * immediately discarded.
 485 */
 486unsigned char * __pskb_trim_head(struct sk_buff *skb, int len)
 487{
 488        int i, k, eat;
 489
 490        eat = len;
 491        k = 0;
 492        for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
 493                if (skb_shinfo(skb)->frags[i].size <= eat) {
 494                        put_page(skb_shinfo(skb)->frags[i].page);
 495                        eat -= skb_shinfo(skb)->frags[i].size;
 496                } else {
 497                        skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
 498                        if (eat) {
 499                                skb_shinfo(skb)->frags[k].page_offset += eat;
 500                                skb_shinfo(skb)->frags[k].size -= eat;
 501                                eat = 0;
 502                        }
 503                        k++;
 504                }
 505        }
 506        skb_shinfo(skb)->nr_frags = k;
 507
 508        skb->tail = skb->data;
 509        skb->data_len -= len;
 510        skb->len = skb->data_len;
 511        return skb->tail;
 512}
 513
 514static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 515{
 516        if (skb_cloned(skb) &&
 517            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 518                return -ENOMEM;
 519
 520        if (len <= skb_headlen(skb)) {
 521                __skb_pull(skb, len);
 522        } else {
 523                if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
 524                        return -ENOMEM;
 525        }
 526
 527        TCP_SKB_CB(skb)->seq += len;
 528        skb->ip_summed = CHECKSUM_HW;
 529        return 0;
 530}
 531
 532/* This function synchronize snd mss to current pmtu/exthdr set.
 533
 534   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
 535   for TCP options, but includes only bare TCP header.
 536
 537   tp->mss_clamp is mss negotiated at connection setup.
 538   It is minumum of user_mss and mss received with SYN.
 539   It also does not include TCP options.
 540
 541   tp->pmtu_cookie is last pmtu, seen by this function.
 542
 543   tp->mss_cache is current effective sending mss, including
 544   all tcp options except for SACKs. It is evaluated,
 545   taking into account current pmtu, but never exceeds
 546   tp->mss_clamp.
 547
 548   NOTE1. rfc1122 clearly states that advertised MSS
 549   DOES NOT include either tcp or ip options.
 550
 551   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
 552   this function.                       --ANK (980731)
 553 */
 554
 555int tcp_sync_mss(struct sock *sk, u32 pmtu)
 556{
 557        struct tcp_opt *tp = tcp_sk(sk);
 558        struct dst_entry *dst = __sk_dst_get(sk);
 559        int mss_now;
 560
 561        if (dst && dst->ops->get_mss)
 562                pmtu = dst->ops->get_mss(dst, pmtu);
 563
 564        /* Calculate base mss without TCP options:
 565           It is MMS_S - sizeof(tcphdr) of rfc1122
 566         */
 567        mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
 568
 569        /* Clamp it (mss_clamp does not include tcp options) */
 570        if (mss_now > tp->mss_clamp)
 571                mss_now = tp->mss_clamp;
 572
 573        /* Now subtract optional transport overhead */
 574        mss_now -= tp->ext_header_len + tp->ext2_header_len;
 575
 576        /* Then reserve room for full set of TCP options and 8 bytes of data */
 577        if (mss_now < 48)
 578                mss_now = 48;
 579
 580        /* Now subtract TCP options size, not including SACKs */
 581        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
 582
 583        /* Bound mss with half of window */
 584        if (tp->max_window && mss_now > (tp->max_window>>1))
 585                mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
 586
 587        /* And store cached results */
 588        tp->pmtu_cookie = pmtu;
 589        tp->mss_cache = tp->mss_cache_std = mss_now;
 590
 591        if (sk->sk_route_caps & NETIF_F_TSO) {
 592                int large_mss;
 593
 594                large_mss = 65535 - tp->af_specific->net_header_len -
 595                        tp->ext_header_len - tp->ext2_header_len - tp->tcp_header_len;
 596
 597                if (tp->max_window && large_mss > (tp->max_window>>1))
 598                        large_mss = max((tp->max_window>>1), 68U - tp->tcp_header_len);
 599
 600                /* Always keep large mss multiple of real mss. */
 601                tp->mss_cache = mss_now*(large_mss/mss_now);
 602        }
 603
 604        return mss_now;
 605}
 606
 607
 608/* This routine writes packets to the network.  It advances the
 609 * send_head.  This happens as incoming acks open up the remote
 610 * window for us.
 611 *
 612 * Returns 1, if no segments are in flight and we have queued segments, but
 613 * cannot send anything now because of SWS or another problem.
 614 */
 615int tcp_write_xmit(struct sock *sk, int nonagle)
 616{
 617        struct tcp_opt *tp = tcp_sk(sk);
 618        unsigned int mss_now;
 619
 620        /* If we are closed, the bytes will have to remain here.
 621         * In time closedown will finish, we empty the write queue and all
 622         * will be happy.
 623         */
 624        if (sk->sk_state != TCP_CLOSE) {
 625                struct sk_buff *skb;
 626                int sent_pkts = 0;
 627
 628                /* Account for SACKS, we may need to fragment due to this.
 629                 * It is just like the real MSS changing on us midstream.
 630                 * We also handle things correctly when the user adds some
 631                 * IP options mid-stream.  Silly to do, but cover it.
 632                 */
 633                mss_now = tcp_current_mss(sk, 1);
 634
 635                while((skb = tp->send_head) &&
 636                      tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)) {
 637                        if (skb->len > mss_now) {
 638                                if (tcp_fragment(sk, skb, mss_now))
 639                                        break;
 640                        }
 641
 642                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
 643                        if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
 644                                break;
 645                        /* Advance the send_head.  This one is sent out. */
 646                        update_send_head(sk, tp, skb);
 647                        tcp_minshall_update(tp, mss_now, skb);
 648                        sent_pkts = 1;
 649                }
 650
 651                if (sent_pkts) {
 652                        tcp_cwnd_validate(sk, tp);
 653                        return 0;
 654                }
 655
 656                return !tp->packets_out && tp->send_head;
 657        }
 658        return 0;
 659}
 660
 661/* This function returns the amount that we can raise the
 662 * usable window based on the following constraints
 663 *  
 664 * 1. The window can never be shrunk once it is offered (RFC 793)
 665 * 2. We limit memory per socket
 666 *
 667 * RFC 1122:
 668 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 669 *  RECV.NEXT + RCV.WIN fixed until:
 670 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 671 *
 672 * i.e. don't raise the right edge of the window until you can raise
 673 * it at least MSS bytes.
 674 *
 675 * Unfortunately, the recommended algorithm breaks header prediction,
 676 * since header prediction assumes th->window stays fixed.
 677 *
 678 * Strictly speaking, keeping th->window fixed violates the receiver
 679 * side SWS prevention criteria. The problem is that under this rule
 680 * a stream of single byte packets will cause the right side of the
 681 * window to always advance by a single byte.
 682 * 
 683 * Of course, if the sender implements sender side SWS prevention
 684 * then this will not be a problem.
 685 * 
 686 * BSD seems to make the following compromise:
 687 * 
 688 *      If the free space is less than the 1/4 of the maximum
 689 *      space available and the free space is less than 1/2 mss,
 690 *      then set the window to 0.
 691 *      [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 692 *      Otherwise, just prevent the window from shrinking
 693 *      and from being larger than the largest representable value.
 694 *
 695 * This prevents incremental opening of the window in the regime
 696 * where TCP is limited by the speed of the reader side taking
 697 * data out of the TCP receive queue. It does nothing about
 698 * those cases where the window is constrained on the sender side
 699 * because the pipeline is full.
 700 *
 701 * BSD also seems to "accidentally" limit itself to windows that are a
 702 * multiple of MSS, at least until the free space gets quite small.
 703 * This would appear to be a side effect of the mbuf implementation.
 704 * Combining these two algorithms results in the observed behavior
 705 * of having a fixed window size at almost all times.
 706 *
 707 * Below we obtain similar behavior by forcing the offered window to
 708 * a multiple of the mss when it is feasible to do so.
 709 *
 710 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 711 * Regular options like TIMESTAMP are taken into account.
 712 */
 713u32 __tcp_select_window(struct sock *sk)
 714{
 715        struct tcp_opt *tp = tcp_sk(sk);
 716        /* MSS for the peer's data.  Previous verions used mss_clamp
 717         * here.  I don't know if the value based on our guesses
 718         * of peer's MSS is better for the performance.  It's more correct
 719         * but may be worse for the performance because of rcv_mss
 720         * fluctuations.  --SAW  1998/11/1
 721         */
 722        int mss = tp->ack.rcv_mss;
 723        int free_space = tcp_space(sk);
 724        int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
 725        int window;
 726
 727        if (mss > full_space)
 728                mss = full_space; 
 729
 730        if (free_space < full_space/2) {
 731                tp->ack.quick = 0;
 732
 733                if (tcp_memory_pressure)
 734                        tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
 735
 736                if (free_space < mss)
 737                        return 0;
 738        }
 739
 740        if (free_space > tp->rcv_ssthresh)
 741                free_space = tp->rcv_ssthresh;
 742
 743        /* Get the largest window that is a nice multiple of mss.
 744         * Window clamp already applied above.
 745         * If our current window offering is within 1 mss of the
 746         * free space we just keep it. This prevents the divide
 747         * and multiply from happening most of the time.
 748         * We also don't do any window rounding when the free space
 749         * is too small.
 750         */
 751        window = tp->rcv_wnd;
 752        if (window <= free_space - mss || window > free_space)
 753                window = (free_space/mss)*mss;
 754
 755        return window;
 756}
 757
 758/* Attempt to collapse two adjacent SKB's during retransmission. */
 759static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
 760{
 761        struct tcp_opt *tp = tcp_sk(sk);
 762        struct sk_buff *next_skb = skb->next;
 763
 764        /* The first test we must make is that neither of these two
 765         * SKB's are still referenced by someone else.
 766         */
 767        if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
 768                int skb_size = skb->len, next_skb_size = next_skb->len;
 769                u16 flags = TCP_SKB_CB(skb)->flags;
 770
 771                /* Also punt if next skb has been SACK'd. */
 772                if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
 773                        return;
 774
 775                /* Next skb is out of window. */
 776                if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
 777                        return;
 778
 779                /* Punt if not enough space exists in the first SKB for
 780                 * the data in the second, or the total combined payload
 781                 * would exceed the MSS.
 782                 */
 783                if ((next_skb_size > skb_tailroom(skb)) ||
 784                    ((skb_size + next_skb_size) > mss_now))
 785                        return;
 786
 787                /* Ok.  We will be able to collapse the packet. */
 788                __skb_unlink(next_skb, next_skb->list);
 789
 790                memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
 791
 792                if (next_skb->ip_summed == CHECKSUM_HW)
 793                        skb->ip_summed = CHECKSUM_HW;
 794
 795                if (skb->ip_summed != CHECKSUM_HW)
 796                        skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
 797
 798                /* Update sequence range on original skb. */
 799                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
 800
 801                /* Merge over control information. */
 802                flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
 803                TCP_SKB_CB(skb)->flags = flags;
 804
 805                /* All done, get rid of second SKB and account for it so
 806                 * packet counting does not break.
 807                 */
 808                TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
 809                if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
 810                        tp->retrans_out--;
 811                if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
 812                        tp->lost_out--;
 813                        tp->left_out--;
 814                }
 815                /* Reno case is special. Sigh... */
 816                if (!tp->sack_ok && tp->sacked_out) {
 817                        tp->sacked_out--;
 818                        tp->left_out--;
 819                }
 820
 821                /* Not quite right: it can be > snd.fack, but
 822                 * it is better to underestimate fackets.
 823                 */
 824                if (tp->fackets_out)
 825                        tp->fackets_out--;
 826                tcp_free_skb(sk, next_skb);
 827                tp->packets_out--;
 828        }
 829}
 830
 831/* Do a simple retransmit without using the backoff mechanisms in
 832 * tcp_timer. This is used for path mtu discovery. 
 833 * The socket is already locked here.
 834 */ 
 835void tcp_simple_retransmit(struct sock *sk)
 836{
 837        struct tcp_opt *tp = tcp_sk(sk);
 838        struct sk_buff *skb;
 839        unsigned int mss = tcp_current_mss(sk, 0);
 840        int lost = 0;
 841
 842        for_retrans_queue(skb, sk, tp) {
 843                if (skb->len > mss && 
 844                    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
 845                        if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
 846                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 847                                tp->retrans_out--;
 848                        }
 849                        if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
 850                                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 851                                tp->lost_out++;
 852                                lost = 1;
 853                        }
 854                }
 855        }
 856
 857        if (!lost)
 858                return;
 859
 860        tcp_sync_left_out(tp);
 861
 862        /* Don't muck with the congestion window here.
 863         * Reason is that we do not increase amount of _data_
 864         * in network, but units changed and effective
 865         * cwnd/ssthresh really reduced now.
 866         */
 867        if (tp->ca_state != TCP_CA_Loss) {
 868                tp->high_seq = tp->snd_nxt;
 869                tp->snd_ssthresh = tcp_current_ssthresh(tp);
 870                tp->prior_ssthresh = 0;
 871                tp->undo_marker = 0;
 872                tp->ca_state = TCP_CA_Loss;
 873        }
 874        tcp_xmit_retransmit_queue(sk);
 875}
 876
 877/* This retransmits one SKB.  Policy decisions and retransmit queue
 878 * state updates are done by the caller.  Returns non-zero if an
 879 * error occurred which prevented the send.
 880 */
 881int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 882{
 883        struct tcp_opt *tp = tcp_sk(sk);
 884        unsigned int cur_mss = tcp_current_mss(sk, 0);
 885        int err;
 886
 887        /* Do not sent more than we queued. 1/4 is reserved for possible
 888         * copying overhead: frgagmentation, tunneling, mangling etc.
 889         */
 890        if (atomic_read(&sk->sk_wmem_alloc) >
 891            min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
 892                return -EAGAIN;
 893
 894        if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
 895                if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
 896                        BUG();
 897
 898                if (sk->sk_route_caps & NETIF_F_TSO) {
 899                        sk->sk_route_caps &= ~NETIF_F_TSO;
 900                        sk->sk_no_largesend = 1;
 901                        tp->mss_cache = tp->mss_cache_std;
 902                }
 903
 904                if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
 905                        return -ENOMEM;
 906        }
 907
 908        /* If receiver has shrunk his window, and skb is out of
 909         * new window, do not retransmit it. The exception is the
 910         * case, when window is shrunk to zero. In this case
 911         * our retransmit serves as a zero window probe.
 912         */
 913        if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
 914            && TCP_SKB_CB(skb)->seq != tp->snd_una)
 915                return -EAGAIN;
 916
 917        if(skb->len > cur_mss) {
 918                if(tcp_fragment(sk, skb, cur_mss))
 919                        return -ENOMEM; /* We'll try again later. */
 920
 921                /* New SKB created, account for it. */
 922                tp->packets_out++;
 923        }
 924
 925        /* Collapse two adjacent packets if worthwhile and we can. */
 926        if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
 927           (skb->len < (cur_mss >> 1)) &&
 928           (skb->next != tp->send_head) &&
 929           (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
 930           (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
 931           (sysctl_tcp_retrans_collapse != 0))
 932                tcp_retrans_try_collapse(sk, skb, cur_mss);
 933
 934        if(tp->af_specific->rebuild_header(sk))
 935                return -EHOSTUNREACH; /* Routing failure or similar. */
 936
 937        /* Some Solaris stacks overoptimize and ignore the FIN on a
 938         * retransmit when old data is attached.  So strip it off
 939         * since it is cheap to do so and saves bytes on the network.
 940         */
 941        if(skb->len > 0 &&
 942           (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
 943           tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
 944                if (!pskb_trim(skb, 0)) {
 945                        TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
 946                        skb->ip_summed = CHECKSUM_NONE;
 947                        skb->csum = 0;
 948                }
 949        }
 950
 951        /* Make a copy, if the first transmission SKB clone we made
 952         * is still in somebody's hands, else make a clone.
 953         */
 954        TCP_SKB_CB(skb)->when = tcp_time_stamp;
 955
 956        err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
 957                                    pskb_copy(skb, GFP_ATOMIC):
 958                                    skb_clone(skb, GFP_ATOMIC)));
 959
 960        if (err == 0) {
 961                /* Update global TCP statistics. */
 962                TCP_INC_STATS(TcpRetransSegs);
 963
 964#if FASTRETRANS_DEBUG > 0
 965                if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
 966                        if (net_ratelimit())
 967                                printk(KERN_DEBUG "retrans_out leaked.\n");
 968                }
 969#endif
 970                TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
 971                tp->retrans_out++;
 972
 973                /* Save stamp of the first retransmit. */
 974                if (!tp->retrans_stamp)
 975                        tp->retrans_stamp = TCP_SKB_CB(skb)->when;
 976
 977                tp->undo_retrans++;
 978
 979                /* snd_nxt is stored to detect loss of retransmitted segment,
 980                 * see tcp_input.c tcp_sacktag_write_queue().
 981                 */
 982                TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
 983        }
 984        return err;
 985}
 986
 987/* This gets called after a retransmit timeout, and the initially
 988 * retransmitted data is acknowledged.  It tries to continue
 989 * resending the rest of the retransmit queue, until either
 990 * we've sent it all or the congestion window limit is reached.
 991 * If doing SACK, the first ACK which comes back for a timeout
 992 * based retransmit packet might feed us FACK information again.
 993 * If so, we use it to avoid unnecessarily retransmissions.
 994 */
 995void tcp_xmit_retransmit_queue(struct sock *sk)
 996{
 997        struct tcp_opt *tp = tcp_sk(sk);
 998        struct sk_buff *skb;
 999        int packet_cnt = tp->lost_out;
1000
1001        /* First pass: retransmit lost packets. */
1002        if (packet_cnt) {
1003                for_retrans_queue(skb, sk, tp) {
1004                        __u8 sacked = TCP_SKB_CB(skb)->sacked;
1005
1006                        if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1007                                return;
1008
1009                        if (sacked&TCPCB_LOST) {
1010                                if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
1011                                        if (tcp_retransmit_skb(sk, skb))
1012                                                return;
1013                                        if (tp->ca_state != TCP_CA_Loss)
1014                                                NET_INC_STATS_BH(TCPFastRetrans);
1015                                        else
1016                                                NET_INC_STATS_BH(TCPSlowStartRetrans);
1017
1018                                        if (skb ==
1019                                            skb_peek(&sk->sk_write_queue))
1020                                                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1021                                }
1022
1023                                if (--packet_cnt <= 0)
1024                                        break;
1025                        }
1026                }
1027        }
1028
1029        /* OK, demanded retransmission is finished. */
1030
1031        /* Forward retransmissions are possible only during Recovery. */
1032        if (tp->ca_state != TCP_CA_Recovery)
1033                return;
1034
1035        /* No forward retransmissions in Reno are possible. */
1036        if (!tp->sack_ok)
1037                return;
1038
1039        /* Yeah, we have to make difficult choice between forward transmission
1040         * and retransmission... Both ways have their merits...
1041         *
1042         * For now we do not retrnamsit anything, while we have some new
1043         * segments to send.
1044         */
1045
1046        if (tcp_may_send_now(sk, tp))
1047                return;
1048
1049        packet_cnt = 0;
1050
1051        for_retrans_queue(skb, sk, tp) {
1052                if(++packet_cnt > tp->fackets_out)
1053                        break;
1054
1055                if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1056                        break;
1057
1058                if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
1059                        continue;
1060
1061                /* Ok, retransmit it. */
1062                if(tcp_retransmit_skb(sk, skb))
1063                        break;
1064
1065                if (skb == skb_peek(&sk->sk_write_queue))
1066                        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1067
1068                NET_INC_STATS_BH(TCPForwardRetrans);
1069        }
1070}
1071
1072
1073/* Send a fin.  The caller locks the socket for us.  This cannot be
1074 * allowed to fail queueing a FIN frame under any circumstances.
1075 */
1076void tcp_send_fin(struct sock *sk)
1077{
1078        struct tcp_opt *tp = tcp_sk(sk);        
1079        struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
1080        unsigned int mss_now;
1081        
1082        /* Optimization, tack on the FIN if we have a queue of
1083         * unsent frames.  But be careful about outgoing SACKS
1084         * and IP options.
1085         */
1086        mss_now = tcp_current_mss(sk, 1); 
1087
1088        if(tp->send_head != NULL) {
1089                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
1090                TCP_SKB_CB(skb)->end_seq++;
1091                tp->write_seq++;
1092        } else {
1093                /* Socket is locked, keep trying until memory is available. */
1094                for (;;) {
1095                        skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
1096                        if (skb)
1097                                break;
1098                        yield();
1099                }
1100
1101                /* Reserve space for headers and prepare control bits. */
1102                skb_reserve(skb, MAX_TCP_HEADER);
1103                skb->csum = 0;
1104                TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
1105                TCP_SKB_CB(skb)->sacked = 0;
1106
1107                /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
1108                TCP_SKB_CB(skb)->seq = tp->write_seq;
1109                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1110                tcp_send_skb(sk, skb, 1, mss_now);
1111        }
1112        __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
1113}
1114
1115/* We get here when a process closes a file descriptor (either due to
1116 * an explicit close() or as a byproduct of exit()'ing) and there
1117 * was unread data in the receive queue.  This behavior is recommended
1118 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
1119 */
1120void tcp_send_active_reset(struct sock *sk, int priority)
1121{
1122        struct tcp_opt *tp = tcp_sk(sk);
1123        struct sk_buff *skb;
1124
1125        /* NOTE: No TCP options attached and we never retransmit this. */
1126        skb = alloc_skb(MAX_TCP_HEADER, priority);
1127        if (!skb) {
1128                NET_INC_STATS(TCPAbortFailed);
1129                return;
1130        }
1131
1132        /* Reserve space for headers and prepare control bits. */
1133        skb_reserve(skb, MAX_TCP_HEADER);
1134        skb->csum = 0;
1135        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
1136        TCP_SKB_CB(skb)->sacked = 0;
1137
1138        /* Send it off. */
1139        TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
1140        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1141        TCP_SKB_CB(skb)->when = tcp_time_stamp;
1142        if (tcp_transmit_skb(sk, skb))
1143                NET_INC_STATS(TCPAbortFailed);
1144}
1145
1146/* WARNING: This routine must only be called when we have already sent
1147 * a SYN packet that crossed the incoming SYN that caused this routine
1148 * to get called. If this assumption fails then the initial rcv_wnd
1149 * and rcv_wscale values will not be correct.
1150 */
1151int tcp_send_synack(struct sock *sk)
1152{
1153        struct sk_buff* skb;
1154
1155        skb = skb_peek(&sk->sk_write_queue);
1156        if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
1157                printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
1158                return -EFAULT;
1159        }
1160        if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
1161                if (skb_cloned(skb)) {
1162                        struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
1163                        if (nskb == NULL)
1164                                return -ENOMEM;
1165                        __skb_unlink(skb, &sk->sk_write_queue);
1166                        __skb_queue_head(&sk->sk_write_queue, nskb);
1167                        tcp_free_skb(sk, skb);
1168                        tcp_charge_skb(sk, nskb);
1169                        skb = nskb;
1170                }
1171
1172                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
1173                TCP_ECN_send_synack(tcp_sk(sk), skb);
1174        }
1175        TCP_SKB_CB(skb)->when = tcp_time_stamp;
1176        return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1177}
1178
1179/*
1180 * Prepare a SYN-ACK.
1181 */
1182struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1183                                 struct open_request *req)
1184{
1185        struct tcp_opt *tp = tcp_sk(sk);
1186        struct tcphdr *th;
1187        int tcp_header_size;
1188        struct sk_buff *skb;
1189
1190        skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
1191        if (skb == NULL)
1192                return NULL;
1193
1194        /* Reserve space for headers. */
1195        skb_reserve(skb, MAX_TCP_HEADER);
1196
1197        skb->dst = dst_clone(dst);
1198
1199        tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1200                           (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1201                           (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1202                           /* SACK_PERM is in the place of NOP NOP of TS */
1203                           ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1204        skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1205
1206        memset(th, 0, sizeof(struct tcphdr));
1207        th->syn = 1;
1208        th->ack = 1;
1209        if (dst->dev->features&NETIF_F_TSO)
1210                req->ecn_ok = 0;
1211        TCP_ECN_make_synack(req, th);
1212        th->source = inet_sk(sk)->sport;
1213        th->dest = req->rmt_port;
1214        TCP_SKB_CB(skb)->seq = req->snt_isn;
1215        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1216        th->seq = htonl(TCP_SKB_CB(skb)->seq);
1217        th->ack_seq = htonl(req->rcv_isn + 1);
1218        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1219                __u8 rcv_wscale; 
1220                /* Set this up on the first call only */
1221                req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
1222                /* tcp_full_space because it is guaranteed to be the first packet */
1223                tcp_select_initial_window(tcp_full_space(sk), 
1224                        dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1225                        &req->rcv_wnd,
1226                        &req->window_clamp,
1227                        req->wscale_ok,
1228                        &rcv_wscale);
1229                req->rcv_wscale = rcv_wscale; 
1230        }
1231
1232        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1233        th->window = htons(req->rcv_wnd);
1234
1235        TCP_SKB_CB(skb)->when = tcp_time_stamp;
1236        tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
1237                              req->sack_ok, req->wscale_ok, req->rcv_wscale,
1238                              TCP_SKB_CB(skb)->when,
1239                              req->ts_recent);
1240
1241        skb->csum = 0;
1242        th->doff = (tcp_header_size >> 2);
1243        TCP_INC_STATS(TcpOutSegs);
1244        return skb;
1245}
1246
1247/* 
1248 * Do all connect socket setups that can be done AF independent.
1249 */ 
1250static inline void tcp_connect_init(struct sock *sk)
1251{
1252        struct dst_entry *dst = __sk_dst_get(sk);
1253        struct tcp_opt *tp = tcp_sk(sk);
1254
1255        /* We'll fix this up when we get a response from the other end.
1256         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
1257         */
1258        tp->tcp_header_len = sizeof(struct tcphdr) +
1259                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
1260
1261        /* If user gave his TCP_MAXSEG, record it to clamp */
1262        if (tp->user_mss)
1263                tp->mss_clamp = tp->user_mss;
1264        tp->max_window = 0;
1265        tcp_sync_mss(sk, dst_pmtu(dst));
1266
1267        if (!tp->window_clamp)
1268                tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1269        tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1270        tcp_initialize_rcv_mss(sk);
1271
1272        tcp_select_initial_window(tcp_full_space(sk),
1273                                  tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
1274                                  &tp->rcv_wnd,
1275                                  &tp->window_clamp,
1276                                  sysctl_tcp_window_scaling,
1277                                  &tp->rcv_wscale);
1278
1279        tp->rcv_ssthresh = tp->rcv_wnd;
1280
1281        sk->sk_err = 0;
1282        sock_reset_flag(sk, SOCK_DONE);
1283        tp->snd_wnd = 0;
1284        tcp_init_wl(tp, tp->write_seq, 0);
1285        tp->snd_una = tp->write_seq;
1286        tp->snd_sml = tp->write_seq;
1287        tp->rcv_nxt = 0;
1288        tp->rcv_wup = 0;
1289        tp->copied_seq = 0;
1290
1291        tp->rto = TCP_TIMEOUT_INIT;
1292        tp->retransmits = 0;
1293        tcp_clear_retrans(tp);
1294}
1295
1296/*
1297 * Build a SYN and send it off.
1298 */ 
1299int tcp_connect(struct sock *sk)
1300{
1301        struct tcp_opt *tp = tcp_sk(sk);
1302        struct sk_buff *buff;
1303
1304        tcp_connect_init(sk);
1305
1306        buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
1307        if (unlikely(buff == NULL))
1308                return -ENOBUFS;
1309
1310        /* Reserve space for headers. */
1311        skb_reserve(buff, MAX_TCP_HEADER);
1312
1313        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
1314        TCP_ECN_send_syn(sk, tp, buff);
1315        TCP_SKB_CB(buff)->sacked = 0;
1316        buff->csum = 0;
1317        TCP_SKB_CB(buff)->seq = tp->write_seq++;
1318        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1319        tp->snd_nxt = tp->write_seq;
1320        tp->pushed_seq = tp->write_seq;
1321
1322        /* Send it off. */
1323        TCP_SKB_CB(buff)->when = tcp_time_stamp;
1324        tp->retrans_stamp = TCP_SKB_CB(buff)->when;
1325        __skb_queue_tail(&sk->sk_write_queue, buff);
1326        tcp_charge_skb(sk, buff);
1327        tp->packets_out++;
1328        tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
1329        TCP_INC_STATS(TcpActiveOpens);
1330
1331        /* Timer for repeating the SYN until an answer. */
1332        tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1333        return 0;
1334}
1335
1336/* Send out a delayed ack, the caller does the policy checking
1337 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
1338 * for details.
1339 */
1340void tcp_send_delayed_ack(struct sock *sk)
1341{
1342        struct tcp_opt *tp = tcp_sk(sk);
1343        int ato = tp->ack.ato;
1344        unsigned long timeout;
1345
1346        if (ato > TCP_DELACK_MIN) {
1347                int max_ato = HZ/2;
1348
1349                if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
1350                        max_ato = TCP_DELACK_MAX;
1351
1352                /* Slow path, intersegment interval is "high". */
1353
1354                /* If some rtt estimate is known, use it to bound delayed ack.
1355                 * Do not use tp->rto here, use results of rtt measurements
1356                 * directly.
1357                 */
1358                if (tp->srtt) {
1359                        int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
1360
1361                        if (rtt < max_ato)
1362                                max_ato = rtt;
1363                }
1364
1365                ato = min(ato, max_ato);
1366        }
1367
1368        /* Stay within the limit we were given */
1369        timeout = jiffies + ato;
1370
1371        /* Use new timeout only if there wasn't a older one earlier. */
1372        if (tp->ack.pending&TCP_ACK_TIMER) {
1373                /* If delack timer was blocked or is about to expire,
1374                 * send ACK now.
1375                 */
1376                if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
1377                        tcp_send_ack(sk);
1378                        return;
1379                }
1380
1381                if (!time_before(timeout, tp->ack.timeout))
1382                        timeout = tp->ack.timeout;
1383        }
1384        tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
1385        tp->ack.timeout = timeout;
1386        if (!mod_timer(&tp->delack_timer, timeout))
1387                sock_hold(sk);
1388}
1389
1390/* This routine sends an ack and also updates the window. */
1391void tcp_send_ack(struct sock *sk)
1392{
1393        /* If we have been reset, we may not send again. */
1394        if (sk->sk_state != TCP_CLOSE) {
1395                struct tcp_opt *tp = tcp_sk(sk);
1396                struct sk_buff *buff;
1397
1398                /* We are not putting this on the write queue, so
1399                 * tcp_transmit_skb() will set the ownership to this
1400                 * sock.
1401                 */
1402                buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1403                if (buff == NULL) {
1404                        tcp_schedule_ack(tp);
1405                        tp->ack.ato = TCP_ATO_MIN;
1406                        tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
1407                        return;
1408                }
1409
1410                /* Reserve space for headers and prepare control bits. */
1411                skb_reserve(buff, MAX_TCP_HEADER);
1412                buff->csum = 0;
1413                TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1414                TCP_SKB_CB(buff)->sacked = 0;
1415
1416                /* Send it off, this clears delayed acks for us. */
1417                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
1418                TCP_SKB_CB(buff)->when = tcp_time_stamp;
1419                tcp_transmit_skb(sk, buff);
1420        }
1421}
1422
1423/* This routine sends a packet with an out of date sequence
1424 * number. It assumes the other end will try to ack it.
1425 *
1426 * Question: what should we make while urgent mode?
1427 * 4.4BSD forces sending single byte of data. We cannot send
1428 * out of window data, because we have SND.NXT==SND.MAX...
1429 *
1430 * Current solution: to send TWO zero-length segments in urgent mode:
1431 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
1432 * out-of-date with SND.UNA-1 to probe window.
1433 */
1434static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
1435{
1436        struct tcp_opt *tp = tcp_sk(sk);
1437        struct sk_buff *skb;
1438
1439        /* We don't queue it, tcp_transmit_skb() sets ownership. */
1440        skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1441        if (skb == NULL) 
1442                return -1;
1443
1444        /* Reserve space for headers and set control bits. */
1445        skb_reserve(skb, MAX_TCP_HEADER);
1446        skb->csum = 0;
1447        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1448        TCP_SKB_CB(skb)->sacked = urgent;
1449
1450        /* Use a previous sequence.  This should cause the other
1451         * end to send an ack.  Don't queue or clone SKB, just
1452         * send it.
1453         */
1454        TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
1455        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1456        TCP_SKB_CB(skb)->when = tcp_time_stamp;
1457        return tcp_transmit_skb(sk, skb);
1458}
1459
1460int tcp_write_wakeup(struct sock *sk)
1461{
1462        if (sk->sk_state != TCP_CLOSE) {
1463                struct tcp_opt *tp = tcp_sk(sk);
1464                struct sk_buff *skb;
1465
1466                if ((skb = tp->send_head) != NULL &&
1467                    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
1468                        int err;
1469                        int mss = tcp_current_mss(sk, 0);
1470                        int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
1471
1472                        if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1473                                tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1474
1475                        /* We are probing the opening of a window
1476                         * but the window size is != 0
1477                         * must have been a result SWS avoidance ( sender )
1478                         */
1479                        if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
1480                            skb->len > mss) {
1481                                seg_size = min(seg_size, mss);
1482                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1483                                if (tcp_fragment(sk, skb, seg_size))
1484                                        return -1;
1485                                /* SWS override triggered forced fragmentation.
1486                                 * Disable TSO, the connection is too sick. */
1487                                if (sk->sk_route_caps & NETIF_F_TSO) {
1488                                        sk->sk_no_largesend = 1;
1489                                        sk->sk_route_caps &= ~NETIF_F_TSO;
1490                                        tp->mss_cache = tp->mss_cache_std;
1491                                }
1492                        }
1493                        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1494                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
1495                        err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1496                        if (!err) {
1497                                update_send_head(sk, tp, skb);
1498                        }
1499                        return err;
1500                } else {
1501                        if (tp->urg_mode &&
1502                            between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
1503                                tcp_xmit_probe_skb(sk, TCPCB_URG);
1504                        return tcp_xmit_probe_skb(sk, 0);
1505                }
1506        }
1507        return -1;
1508}
1509
1510/* A window probe timeout has occurred.  If the window is not closed, send
1511 * a partial packet; otherwise send a zero probe.  Afterwards either clear the probe state or re-arm TCP_TIME_PROBE0.
1512 * NOTE(review): tp->rto << tp->backoff is clamped by min() only after the shift -- confirm it cannot overflow. */
1513void tcp_send_probe0(struct sock *sk)
1514{
1515        struct tcp_opt *tp = tcp_sk(sk);
1516        int err;
1517        /* tcp_write_wakeup() does the actual sending; its result selects the branch further down. */
1518        err = tcp_write_wakeup(sk);
1519        /* tp->packets_out non-zero or tp->send_head NULL (presumably: data in flight, or nothing queued). */
1520        if (tp->packets_out || !tp->send_head) {
1521                /* Probe timer is not required: clear the probe state and return without re-arming it. */
1522                tp->probes_out = 0;
1523                tp->backoff = 0;
1524                return;
1525        }
1526        /* err <= 0 covers success (0) and also negative results such as the -1 tcp_write_wakeup() can return. */
1527        if (err <= 0) {
1528                if (tp->backoff < sysctl_tcp_retries2)
1529                        tp->backoff++;
1530                tp->probes_out++;
1531                tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
1532                                      min(tp->rto << tp->backoff, TCP_RTO_MAX));
1533        } else {
1534                /* If the packet was not sent due to local congestion,
1535                 * do not back off and do not count this attempt in probes_out
1536                 * (probes_out is only forced to be non-zero).  Let local
1537                 * senders fight for local resources.
1538                 * The backoff accumulated so far is still used for the timer.
1539                 */
1540                if (!tp->probes_out)
1541                        tp->probes_out=1;
1542                tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
1543                                      min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
1544        }
1545}
1546/* Symbols from this file that are exported via EXPORT_SYMBOL for use by code outside it. */
1547EXPORT_SYMBOL(tcp_acceptable_seq);
1548EXPORT_SYMBOL(tcp_connect);
1549EXPORT_SYMBOL(tcp_connect_init);
1550EXPORT_SYMBOL(tcp_make_synack);
1551EXPORT_SYMBOL(tcp_send_synack);
1552EXPORT_SYMBOL(tcp_simple_retransmit);
1553EXPORT_SYMBOL(tcp_sync_mss);
1554EXPORT_SYMBOL(tcp_transmit_skb);
1555EXPORT_SYMBOL(tcp_write_wakeup);
1556EXPORT_SYMBOL(tcp_write_xmit);
1557
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.