linux-bk/net/ipv4/tcp.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
   9 *
  10 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14 *              Florian La Roche, <flla@stud.uni-sb.de>
  15 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18 *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20 *              Jorge Cwik, <jorge@laser.satlink.net>
  21 *
  22 * Fixes:
  23 *              Alan Cox        :       Numerous verify_area() calls
  24 *              Alan Cox        :       Set the ACK bit on a reset
  25 *              Alan Cox        :       Stopped it crashing if it closed while
  26 *                                      sk->inuse=1 and was trying to connect
  27 *                                      (tcp_err()).
  28 *              Alan Cox        :       All icmp error handling was broken
  29 *                                      pointers passed were wrong and the
  30 *                                      socket was looked up backwards. Nobody
  31 *                                      tested any icmp error code obviously.
  32 *              Alan Cox        :       tcp_err() now handled properly. It
  33 *                                      wakes people on errors. poll
  34 *                                      behaves and the icmp error race
  35 *                                      has gone by moving it into sock.c
  36 *              Alan Cox        :       tcp_send_reset() fixed to work for
  37 *                                      everything not just packets for
  38 *                                      unknown sockets.
  39 *              Alan Cox        :       tcp option processing.
  40 *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41 *                                      syn rule wrong]
  42 *              Herp Rosmanith  :       More reset fixes
  43 *              Alan Cox        :       No longer acks invalid rst frames.
  44 *                                      Acking any kind of RST is right out.
  45 *              Alan Cox        :       Sets an ignore me flag on an rst
  46 *                                      receive otherwise odd bits of prattle
  47 *                                      escape still
  48 *              Alan Cox        :       Fixed another acking RST frame bug.
  49 *                                      Should stop LAN workplace lockups.
  50 *              Alan Cox        :       Some tidyups using the new skb list
  51 *                                      facilities
  52 *              Alan Cox        :       sk->keepopen now seems to work
  53 *              Alan Cox        :       Pulls options out correctly on accepts
  54 *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55 *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56 *                                      bit to skb ops.
  57 *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58 *                                      nasty.
  59 *              Alan Cox        :       Added some better commenting, as the
  60 *                                      tcp is hard to follow
  61 *              Alan Cox        :       Removed incorrect check for 20 * psh
  62 *      Michael O'Reilly        :       ack < copied bug fix.
  63 *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64 *              Alan Cox        :       FIN with no memory -> CRASH
  65 *              Alan Cox        :       Added socket option proto entries.
  66 *                                      Also added awareness of them to accept.
  67 *              Alan Cox        :       Added TCP options (SOL_TCP)
  68 *              Alan Cox        :       Switched wakeup calls to callbacks,
  69 *                                      so the kernel can layer network
  70 *                                      sockets.
  71 *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72 *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73 *              Alan Cox        :       RST frames sent on unsynchronised
  74 *                                      state ack error.
  75 *              Alan Cox        :       Put in missing check for SYN bit.
  76 *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77 *                                      window non shrink trick.
  78 *              Alan Cox        :       Added a couple of small NET2E timer
  79 *                                      fixes
  80 *              Charles Hedrick :       TCP fixes
  81 *              Toomas Tamm     :       TCP window fixes
  82 *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83 *              Charles Hedrick :       Rewrote most of it to actually work
  84 *              Linus           :       Rewrote tcp_read() and URG handling
  85 *                                      completely
  86 *              Gerhard Koerting:       Fixed some missing timer handling
  87 *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88 *              Gerhard Koerting:       PC/TCP workarounds
  89 *              Adam Caldwell   :       Assorted timer/timing errors
  90 *              Matthew Dillon  :       Fixed another RST bug
  91 *              Alan Cox        :       Move to kernel side addressing changes.
  92 *              Alan Cox        :       Beginning work on TCP fastpathing
  93 *                                      (not yet usable)
  94 *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95 *              Alan Cox        :       TCP fast path debugging
  96 *              Alan Cox        :       Window clamping
  97 *              Michael Riepe   :       Bug in tcp_check()
  98 *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99 *              Matt Dillon     :       Yet more small nasties removed from the
 100 *                                      TCP code (Be very nice to this man if
 101 *                                      tcp finally works 100%) 8)
 102 *              Alan Cox        :       BSD accept semantics.
 103 *              Alan Cox        :       Reset on closedown bug.
 104 *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105 *              Michael Pall    :       Handle poll() after URG properly in
 106 *                                      all cases.
 107 *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108 *                                      (multi URG PUSH broke rlogin).
 109 *              Michael Pall    :       Fix the multi URG PUSH problem in
 110 *                                      tcp_readable(), poll() after URG
 111 *                                      works now.
 112 *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113 *                                      BSD api.
 114 *              Alan Cox        :       Changed the semantics of sk->socket to
 115 *                                      fix a race and a signal problem with
 116 *                                      accept() and async I/O.
 117 *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118 *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119 *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120 *                                      clients/servers which listen in on
 121 *                                      fixed ports.
 122 *              Alan Cox        :       Cleaned the above up and shrank it to
 123 *                                      a sensible code size.
 124 *              Alan Cox        :       Self connect lockup fix.
 125 *              Alan Cox        :       No connect to multicast.
 126 *              Ross Biro       :       Close unaccepted children on master
 127 *                                      socket close.
 128 *              Alan Cox        :       Reset tracing code.
 129 *              Alan Cox        :       Spurious resets on shutdown.
 130 *              Alan Cox        :       Giant 15 minute/60 second timer error
 131 *              Alan Cox        :       Small whoops in polling before an
 132 *                                      accept.
 133 *              Alan Cox        :       Kept the state trace facility since
 134 *                                      it's handy for debugging.
 135 *              Alan Cox        :       More reset handler fixes.
 136 *              Alan Cox        :       Started rewriting the code based on
 137 *                                      the RFC's for other useful protocol
 138 *                                      references see: Comer, KA9Q NOS, and
 139 *                                      for a reference on the difference
 140 *                                      between specifications and how BSD
 141 *                                      works see the 4.4lite source.
 142 *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143 *                                      close.
 144 *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145 *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146 *              Alan Cox        :       Reimplemented timers as per the RFC
 147 *                                      and using multiple timers for sanity.
 148 *              Alan Cox        :       Small bug fixes, and a lot of new
 149 *                                      comments.
 150 *              Alan Cox        :       Fixed dual reader crash by locking
 151 *                                      the buffers (much like datagram.c)
 152 *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153 *                                      now gets fed up of retrying without
 154 *                                      (even a no space) answer.
 155 *              Alan Cox        :       Extracted closing code better
 156 *              Alan Cox        :       Fixed the closing state machine to
 157 *                                      resemble the RFC.
 158 *              Alan Cox        :       More 'per spec' fixes.
 159 *              Jorge Cwik      :       Even faster checksumming.
 160 *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161 *                                      only frames. At least one pc tcp stack
 162 *                                      generates them.
 163 *              Alan Cox        :       Cache last socket.
 164 *              Alan Cox        :       Per route irtt.
 165 *              Matt Day        :       poll()->select() match BSD precisely on error
 166 *              Alan Cox        :       New buffers
 167 *              Marc Tamsky     :       Various sk->prot->retransmits and
 168 *                                      sk->retransmits misupdating fixed.
 169 *                                      Fixed tcp_write_timeout: stuck close,
 170 *                                      and TCP syn retries gets used now.
 171 *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172 *                                      ack if state is TCP_CLOSED.
 173 *              Alan Cox        :       Look up device on a retransmit - routes may
 174 *                                      change. Doesn't yet cope with MSS shrink right
 175 *                                      but it's a start!
 176 *              Marc Tamsky     :       Closing in closing fixes.
 177 *              Mike Shaver     :       RFC1122 verifications.
 178 *              Alan Cox        :       rcv_saddr errors.
 179 *              Alan Cox        :       Block double connect().
 180 *              Alan Cox        :       Small hooks for enSKIP.
 181 *              Alexey Kuznetsov:       Path MTU discovery.
 182 *              Alan Cox        :       Support soft errors.
 183 *              Alan Cox        :       Fix MTU discovery pathological case
 184 *                                      when the remote claims no mtu!
 185 *              Marc Tamsky     :       TCP_CLOSE fix.
 186 *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187 *                                      window but wrong (fixes NT lpd problems)
 188 *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189 *              Joerg Reuter    :       No modification of locked buffers in
 190 *                                      tcp_do_retransmit()
 191 *              Eric Schenk     :       Changed receiver side silly window
 192 *                                      avoidance algorithm to BSD style
 193 *                                      algorithm. This doubles throughput
 194 *                                      against machines running Solaris,
 195 *                                      and seems to result in general
 196 *                                      improvement.
 197 *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198 *      Willy Konynenberg       :       Transparent proxying support.
 199 *      Mike McLagan            :       Routing by source
 200 *              Keith Owens     :       Do proper merging with partial SKB's in
 201 *                                      tcp_do_sendmsg to avoid burstiness.
 202 *              Eric Schenk     :       Fix fast close down bug with
 203 *                                      shutdown() followed by close().
 204 *              Andi Kleen      :       Make poll agree with SIGIO
 205 *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 206 *                                      lingertime == 0 (RFC 793 ABORT Call)
 207 *      Hirokazu Takahashi      :       Use copy_from_user() instead of
 208 *                                      csum_and_copy_from_user() if possible.
 209 *
 210 *              This program is free software; you can redistribute it and/or
 211 *              modify it under the terms of the GNU General Public License
 212 *              as published by the Free Software Foundation; either version
 213 *              2 of the License, or(at your option) any later version.
 214 *
 215 * Description of States:
 216 *
 217 *      TCP_SYN_SENT            sent a connection request, waiting for ack
 218 *
 219 *      TCP_SYN_RECV            received a connection request, sent ack,
 220 *                              waiting for final ack in three-way handshake.
 221 *
 222 *      TCP_ESTABLISHED         connection established
 223 *
 224 *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 225 *                              transmission of remaining buffered data
 226 *
 227 *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 228 *                              to shutdown
 229 *
 230 *      TCP_CLOSING             both sides have shutdown but we still have
 231 *                              data we have to finish sending
 232 *
 233 *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 234 *                              closed, can only be entered from FIN_WAIT2
 235 *                              or CLOSING.  Required because the other end
 236 *                              may not have gotten our last ACK causing it
 237 *                              to retransmit the data packet (which we ignore)
 238 *
 239 *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 240 *                              us to finish writing our data and to shutdown
 241 *                              (we have to close() to move on to LAST_ACK)
 242 *
 243 *      TCP_LAST_ACK            our side has shutdown after remote has
 244 *                              shutdown.  There may still be data in our
 245 *                              buffer that we have to finish sending
 246 *
 247 *      TCP_CLOSE               socket is finished
 248 */
 249
 250#include <linux/config.h>
 251#include <linux/module.h>
 252#include <linux/types.h>
 253#include <linux/fcntl.h>
 254#include <linux/poll.h>
 255#include <linux/init.h>
 256#include <linux/smp_lock.h>
 257#include <linux/fs.h>
 258#include <linux/random.h>
 259
 260#include <net/icmp.h>
 261#include <net/tcp.h>
 262#include <net/xfrm.h>
 263#include <net/ip.h>
 264
 265
 266#include <asm/uaccess.h>
 267#include <asm/ioctls.h>
 268
 269int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 270
 271DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
 272
 273kmem_cache_t *tcp_openreq_cachep;
 274kmem_cache_t *tcp_bucket_cachep;
 275kmem_cache_t *tcp_timewait_cachep;
 276
    /* Incremented by tcp_listen_stop() for every unaccepted child it orphans. */
 277atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 278
    /* Thresholds compared against tcp_memory_allocated by tcp_mem_schedule():
     *   [0] below this the pressure flag is cleared and charges always succeed,
     *   [1] above this memory pressure is entered,
     *   [2] above this (hard limit) new charges are suppressed.
     * No static initializer here; presumably filled in at init time
     * elsewhere -- TODO confirm.
     */
 279int sysctl_tcp_mem[3];
    /* Element [0] of each array is the per-socket floor: tcp_mem_schedule()
     * always grants a charge while the socket's queued send memory
     * (sk_wmem_queued) or receive memory (sk_rmem_alloc) is below it.
     */
 280int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
 281int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
 282
 283atomic_t tcp_memory_allocated;  /* Current allocated memory, in TCP_MEM_QUANTUM units. */
 284atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
 285
 286/* Pressure flag: try to collapse.
 287 * Technical note: it is used by multiple contexts non-atomically.
 288 * All of tcp_mem_schedule() is of this nature: accounting
 289 * is strict, actions are advisory and have some latency. */
 290int tcp_memory_pressure;
 291
    /* Round a byte count up to a whole number of TCP_MEM_QUANTUM units. */
 292#define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
 293
 294int tcp_mem_schedule(struct sock *sk, int size, int kind)
 295{
 296        int amt = TCP_PAGES(size);
 297
 298        sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
 299        atomic_add(amt, &tcp_memory_allocated);
 300
 301        /* Under limit. */
 302        if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
 303                if (tcp_memory_pressure)
 304                        tcp_memory_pressure = 0;
 305                return 1;
 306        }
 307
 308        /* Over hard limit. */
 309        if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
 310                tcp_enter_memory_pressure();
 311                goto suppress_allocation;
 312        }
 313
 314        /* Under pressure. */
 315        if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
 316                tcp_enter_memory_pressure();
 317
 318        if (kind) {
 319                if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
 320                        return 1;
 321        } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
 322                return 1;
 323
 324        if (!tcp_memory_pressure ||
 325            sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
 326                                TCP_PAGES(sk->sk_wmem_queued +
 327                                          atomic_read(&sk->sk_rmem_alloc) +
 328                                          sk->sk_forward_alloc))
 329                return 1;
 330
 331suppress_allocation:
 332
 333        if (!kind) {
 334                tcp_moderate_sndbuf(sk);
 335
 336                /* Fail only if socket is _under_ its sndbuf.
 337                 * In this case we cannot block, so that we have to fail.
 338                 */
 339                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
 340                        return 1;
 341        }
 342
 343        /* Alas. Undo changes. */
 344        sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
 345        atomic_sub(amt, &tcp_memory_allocated);
 346        return 0;
 347}
 348
 349void __tcp_mem_reclaim(struct sock *sk)
 350{
 351        if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
 352                atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
 353                           &tcp_memory_allocated);
 354                sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
 355                if (tcp_memory_pressure &&
 356                    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
 357                        tcp_memory_pressure = 0;
 358        }
 359}
 360
 361void tcp_rfree(struct sk_buff *skb)
 362{
 363        struct sock *sk = skb->sk;
 364
 365        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
 366        sk->sk_forward_alloc += skb->truesize;
 367}
 368
 369/*
 370 * LISTEN is a special case for poll..
 371 */
 372static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
 373                                               poll_table *wait)
 374{
 375        return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
 376}
 377
 378/*
 379 *      Wait for a TCP event.
 380 *
 381 *      Note that we don't need to lock the socket, as the upper poll layers
 382 *      take care of normal races (between the test and the event) and we don't
 383 *      go look at any of the socket buffers directly.
     *
     *      Registers the caller on sk->sk_sleep and returns a mask of POLL*
     *      bits describing the socket. A listening socket is delegated to
     *      tcp_listen_poll(). Otherwise the mask is built from sk_err,
     *      sk_shutdown, sk_state, the receive sequence numbers, the
     *      available write space and the urgent-data state.
 384 */
 385unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 386{
 387        unsigned int mask;
 388        struct sock *sk = sock->sk;
 389        struct tcp_opt *tp = tcp_sk(sk);
 390
 391        poll_wait(file, sk->sk_sleep, wait);
 392        if (sk->sk_state == TCP_LISTEN)
 393                return tcp_listen_poll(sk, wait);
 394
 395        /* Socket is not locked. We are protected from async events
 396           by poll logic and correct handling of state changes
 397           made by other threads is impossible in any case.
 398         */
 399
 400        mask = 0;
 401        if (sk->sk_err)
 402                mask = POLLERR;
 403
 404        /*
 405         * POLLHUP is certainly not done right. But poll() doesn't
 406         * have a notion of HUP in just one direction, and for a
 407         * socket the read side is more interesting.
 408         *
 409         * Some poll() documentation says that POLLHUP is incompatible
 410         * with the POLLOUT/POLLWR flags, so somebody should check this
 411         * all. But careful, it tends to be safer to return too many
 412         * bits than too few, and you can easily break real applications
 413         * if you don't tell them that something has hung up!
 414         *
 415         * Check-me.
 416         *
 417         * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 418         * our fs/select.c). It means that after we received EOF,
 419         * poll always returns immediately, making impossible poll() on write()
 420         * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 421         * if and only if shutdown has been made in both directions.
 422         * Actually, it is interesting to look how Solaris and DUX
 423         * solve this dilemma. I would prefer, if POLLHUP were maskable,
 424         * then we could set it on SND_SHUTDOWN. BTW examples given
 425         * in Stevens' books assume exactly this behaviour, it explains
 426         * why POLLHUP is incompatible with POLLOUT.    --ANK
 427         *
 428         * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 429         * blocking on fresh not-connected or disconnected socket. --ANK
 430         */
 431        if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
 432                mask |= POLLHUP;
 433        if (sk->sk_shutdown & RCV_SHUTDOWN)
 434                mask |= POLLIN | POLLRDNORM;
 435
 436        /* Connected? */
 437        if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 438                /* Potential race condition. If the read of tp below
 439                 * escapes above the sk->sk_state test, we can be wrongly
 440                 * woken up in SYN_* states. */
                    /* Readable when unread data exists, unless the only unread
                     * byte is a pending urgent byte that is not delivered
                     * inline (SOCK_URGINLINE clear and urg_data set).
                     */
 441                if ((tp->rcv_nxt != tp->copied_seq) &&
 442                    (tp->urg_seq != tp->copied_seq ||
 443                     tp->rcv_nxt != tp->copied_seq + 1 ||
 444                     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
 445                        mask |= POLLIN | POLLRDNORM;
 446
 447                if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 448                        if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
 449                                mask |= POLLOUT | POLLWRNORM;
 450                        } else {  /* send SIGIO later */
 451                                set_bit(SOCK_ASYNC_NOSPACE,
 452                                        &sk->sk_socket->flags);
 453                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 454
 455                                /* Race breaker. If space is freed after
 456                                 * wspace test but before the flags are set,
 457                                 * IO signal will be lost.
 458                                 */
 459                                if (tcp_wspace(sk) >= tcp_min_write_space(sk))
 460                                        mask |= POLLOUT | POLLWRNORM;
 461                        }
 462                }
 463
                    /* Valid urgent data pending: report it as priority data. */
 464                if (tp->urg_data & TCP_URG_VALID)
 465                        mask |= POLLPRI;
 466        }
 467        return mask;
 468}
 469
 470/*
 471 *      TCP socket write_space callback.
 472 */
 473void tcp_write_space(struct sock *sk)
 474{
 475        struct socket *sock = sk->sk_socket;
 476
 477        if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
 478                clear_bit(SOCK_NOSPACE, &sock->flags);
 479
 480                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 481                        wake_up_interruptible(sk->sk_sleep);
 482
 483                if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 484                        sock_wake_async(sock, 2, POLL_OUT);
 485        }
 486}
 487
 488int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 489{
 490        struct tcp_opt *tp = tcp_sk(sk);
 491        int answ;
 492
 493        switch (cmd) {
 494        case SIOCINQ:
 495                if (sk->sk_state == TCP_LISTEN)
 496                        return -EINVAL;
 497
 498                lock_sock(sk);
 499                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 500                        answ = 0;
 501                else if (sock_flag(sk, SOCK_URGINLINE) ||
 502                         !tp->urg_data ||
 503                         before(tp->urg_seq, tp->copied_seq) ||
 504                         !before(tp->urg_seq, tp->rcv_nxt)) {
 505                        answ = tp->rcv_nxt - tp->copied_seq;
 506
 507                        /* Subtract 1, if FIN is in queue. */
 508                        if (answ && !skb_queue_empty(&sk->sk_receive_queue))
 509                                answ -=
 510                       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
 511                } else
 512                        answ = tp->urg_seq - tp->copied_seq;
 513                release_sock(sk);
 514                break;
 515        case SIOCATMARK:
 516                answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 517                break;
 518        case SIOCOUTQ:
 519                if (sk->sk_state == TCP_LISTEN)
 520                        return -EINVAL;
 521
 522                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 523                        answ = 0;
 524                else
 525                        answ = tp->write_seq - tp->snd_una;
 526                break;
 527        default:
 528                return -ENOIOCTLCMD;
 529        };
 530
 531        return put_user(answ, (int *)arg);
 532}
 533
 534
 535int tcp_listen_start(struct sock *sk)
 536{
 537        struct inet_opt *inet = inet_sk(sk);
 538        struct tcp_opt *tp = tcp_sk(sk);
 539        struct tcp_listen_opt *lopt;
 540
 541        sk->sk_max_ack_backlog = 0;
 542        sk->sk_ack_backlog = 0;
 543        tp->accept_queue = tp->accept_queue_tail = NULL;
 544        tp->syn_wait_lock = RW_LOCK_UNLOCKED;
 545        tcp_delack_init(tp);
 546
 547        lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
 548        if (!lopt)
 549                return -ENOMEM;
 550
 551        memset(lopt, 0, sizeof(struct tcp_listen_opt));
 552        for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
 553                if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
 554                        break;
 555        get_random_bytes(&lopt->hash_rnd, 4);
 556
 557        write_lock_bh(&tp->syn_wait_lock);
 558        tp->listen_opt = lopt;
 559        write_unlock_bh(&tp->syn_wait_lock);
 560
 561        /* There is race window here: we announce ourselves listening,
 562         * but this transition is still not validated by get_port().
 563         * It is OK, because this socket enters to hash table only
 564         * after validation is complete.
 565         */
 566        sk->sk_state = TCP_LISTEN;
 567        if (!sk->sk_prot->get_port(sk, inet->num)) {
 568                inet->sport = htons(inet->num);
 569
 570                sk_dst_reset(sk);
 571                sk->sk_prot->hash(sk);
 572
 573                return 0;
 574        }
 575
 576        sk->sk_state = TCP_CLOSE;
 577        write_lock_bh(&tp->syn_wait_lock);
 578        tp->listen_opt = NULL;
 579        write_unlock_bh(&tp->syn_wait_lock);
 580        kfree(lopt);
 581        return -EADDRINUSE;
 582}
 583
 584/*
 585 *      This routine closes sockets which have been at least partially
 586 *      opened, but not yet accepted.
     *
     *      It stops the keepalive timer, detaches the listen options and the
     *      accept queue from the socket, frees every request still in the
     *      SYN table, and then disconnects, orphans and destroys each child
     *      socket that was waiting on the accept queue.
     *
     *      NOTE(review): tp->listen_opt is dereferenced without a NULL check,
     *      so this presumably must only be called on a socket that completed
     *      tcp_listen_start() -- confirm against callers.
 587 */
 588
 589static void tcp_listen_stop (struct sock *sk)
 590{
 591        struct tcp_opt *tp = tcp_sk(sk);
 592        struct tcp_listen_opt *lopt = tp->listen_opt;
 593        struct open_request *acc_req = tp->accept_queue;
 594        struct open_request *req;
 595        int i;
 596
 597        tcp_delete_keepalive_timer(sk);
 598
 599        /* make all the listen_opt local to us */
 600        write_lock_bh(&tp->syn_wait_lock);
 601        tp->listen_opt = NULL;
 602        write_unlock_bh(&tp->syn_wait_lock);
 603        tp->accept_queue = tp->accept_queue_tail = NULL;
 604
            /* Drain the SYN table: unlink and free every pending request. */
 605        if (lopt->qlen) {
 606                for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
 607                        while ((req = lopt->syn_table[i]) != NULL) {
 608                                lopt->syn_table[i] = req->dl_next;
 609                                lopt->qlen--;
 610                                tcp_openreq_free(req);
 611
 612                /* Following specs, it would be better either to send FIN
 613                 * (and enter FIN-WAIT-1, it is normal close)
 614                 * or to send active reset (abort).
 615                 * Certainly, it is pretty dangerous while synflood, but it is
 616                 * bad justification for our negligence 8)
 617                 * To be honest, we are not able to make either
 618                 * of the variants now.                 --ANK
 619                 */
 620                        }
 621                }
 622        }
 623        BUG_TRAP(!lopt->qlen);
 624
 625        kfree(lopt);
 626
            /* Walk the detached accept queue and tear down each child. */
 627        while ((req = acc_req) != NULL) {
 628                struct sock *child = req->sk;
 629
                    /* Advance first: req is freed at the bottom of the loop. */
 630                acc_req = req->dl_next;
 631
 632                local_bh_disable();
 633                bh_lock_sock(child);
 634                BUG_TRAP(!sock_owned_by_user(child));
                    /* Extra reference, dropped by sock_put() below; presumably
                     * keeps child valid across tcp_destroy_sock() -- confirm.
                     */
 635                sock_hold(child);
 636
 637                tcp_disconnect(child, O_NONBLOCK);
 638
 639                sock_orphan(child);
 640
 641                atomic_inc(&tcp_orphan_count);
 642
 643                tcp_destroy_sock(child);
 644
 645                bh_unlock_sock(child);
 646                local_bh_enable();
 647                sock_put(child);
 648
 649                tcp_acceptq_removed(sk);
 650                tcp_openreq_fastfree(req);
 651        }
 652        BUG_TRAP(!sk->sk_ack_backlog);
 653}
 654
 655/*
 656 *      Wait for a socket to get into the connected state
 657 *
 658 *      Note: Must be called with the socket locked.
 659 */
 660static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
 661{
 662        struct tcp_opt *tp = tcp_sk(sk);
 663        struct task_struct *tsk = current;
 664        DEFINE_WAIT(wait);
 665
 666        while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 667                if (sk->sk_err)
 668                        return sock_error(sk);
 669                if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
 670                        return -EPIPE;
 671                if (!*timeo_p)
 672                        return -EAGAIN;
 673                if (signal_pending(tsk))
 674                        return sock_intr_errno(*timeo_p);
 675
 676                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 677                tp->write_pending++;
 678
 679                release_sock(sk);
 680                *timeo_p = schedule_timeout(*timeo_p);
 681                lock_sock(sk);
 682
 683                finish_wait(sk->sk_sleep, &wait);
 684                tp->write_pending--;
 685        }
 686        return 0;
 687}
 688
 689static inline int tcp_memory_free(struct sock *sk)
 690{
 691        return sk->sk_wmem_queued < sk->sk_sndbuf;
 692}
 693
 694/*
 695 *      Wait for more memory for a socket
     *
     *      Sleeps interruptibly on sk->sk_sleep until tcp_memory_free()
     *      reports room in the send buffer.  The socket lock is released
     *      around schedule_timeout() and re-taken afterwards.
     *
     *      If the send buffer already has room on entry (presumably the
     *      caller failed an allocation instead -- confirm against callers),
     *      a short random wait of (net_random() % (HZ / 5)) + 2 jiffies is
     *      taken, tracked in vm_wait, and the time actually slept is then
     *      subtracted from the caller's timeout.
     *
     *      *timeo is updated with the remaining timeout after each sleep.
     *
     *      Returns 0 when memory is available, -EPIPE on a socket error or
     *      send shutdown, -EAGAIN when *timeo is zero, or
     *      sock_intr_errno(*timeo) when a signal is pending.
 696 */
 697static int wait_for_tcp_memory(struct sock *sk, long *timeo)
 698{
 699        struct tcp_opt *tp = tcp_sk(sk);
 700        int err = 0;
 701        long vm_wait = 0;
 702        long current_timeo = *timeo;
 703        DEFINE_WAIT(wait);
 704
            /* Send buffer is not the limit: use a short random wait instead. */
 705        if (tcp_memory_free(sk))
 706                current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
 707
 708        for (;;) {
 709                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 710
 711                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 712
                    /* The three error exits below leave SOCK_ASYNC_NOSPACE set. */
 713                if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 714                        goto do_error;
 715                if (!*timeo)
 716                        goto do_nonblock;
 717                if (signal_pending(current))
 718                        goto do_interrupted;
 719                clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                    /* Done once there is room and no random wait is outstanding. */
 720                if (tcp_memory_free(sk) && !vm_wait)
 721                        break;
 722
 723                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 724                tp->write_pending++;
 725                release_sock(sk);
                    /* Re-check after dropping the lock; skip the sleep if room appeared. */
 726                if (!tcp_memory_free(sk) || vm_wait)
 727                        current_timeo = schedule_timeout(current_timeo);
 728                lock_sock(sk);
 729                tp->write_pending--;
 730
 731                if (vm_wait) {
                            /* vm_wait becomes the time actually slept; charge it
                             * to the caller's timeout unless that is infinite.
                             */
 732                        vm_wait -= current_timeo;
 733                        current_timeo = *timeo;
 734                        if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
 735                            (current_timeo -= vm_wait) < 0)
 736                                current_timeo = 0;
 737                        vm_wait = 0;
 738                }
 739                *timeo = current_timeo;
 740        }
 741out:
 742        finish_wait(sk->sk_sleep, &wait);
 743        return err;
 744
 745do_error:
 746        err = -EPIPE;
 747        goto out;
 748do_nonblock:
 749        err = -EAGAIN;
 750        goto out;
 751do_interrupted:
 752        err = sock_intr_errno(*timeo);
 753        goto out;
 754}
 755
    /* Forward declaration; the definition appears further down in this file. */
 756ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 757                         size_t psize, int flags);
 758
 759static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
 760                               int off)
 761{
 762        if (i) {
 763                skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
 764                return page == frag->page &&
 765                       off == frag->page_offset + frag->size;
 766        }
 767        return 0;
 768}
 769
 770static inline void fill_page_desc(struct sk_buff *skb, int i,
 771                                  struct page *page, int off, int size)
 772{
 773        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 774        frag->page = page;
 775        frag->page_offset = off;
 776        frag->size = size;
 777        skb_shinfo(skb)->nr_frags = i + 1;
 778}
 779
 780static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
 781{
 782        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 783        tp->pushed_seq = tp->write_seq;
 784}
 785
 786static inline int forced_push(struct tcp_opt *tp)
 787{
 788        return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 789}
 790
 791static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
 792                              struct sk_buff *skb)
 793{
 794        skb->csum = 0;
 795        TCP_SKB_CB(skb)->seq = tp->write_seq;
 796        TCP_SKB_CB(skb)->end_seq = tp->write_seq;
 797        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
 798        TCP_SKB_CB(skb)->sacked = 0;
 799        __skb_queue_tail(&sk->sk_write_queue, skb);
 800        tcp_charge_skb(sk, skb);
 801        if (!tp->send_head)
 802                tp->send_head = skb;
 803        else if (tp->nonagle&TCP_NAGLE_PUSH)
 804                tp->nonagle &= ~TCP_NAGLE_PUSH; 
 805}
 806
 807static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
 808                                struct sk_buff *skb)
 809{
 810        if (flags & MSG_OOB) {
 811                tp->urg_mode = 1;
 812                tp->snd_up = tp->write_seq;
 813                TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
 814        }
 815}
 816
 817static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
 818                            int mss_now, int nonagle)
 819{
 820        if (tp->send_head) {
 821                struct sk_buff *skb = sk->sk_write_queue.prev;
 822                if (!(flags & MSG_MORE) || forced_push(tp))
 823                        tcp_mark_push(tp, skb);
 824                tcp_mark_urg(tp, flags, skb);
 825                __tcp_push_pending_frames(sk, tp, mss_now,
 826                                          (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 827        }
 828}
 829
 830static int tcp_error(struct sock *sk, int flags, int err)
 831{
 832        if (err == -EPIPE)
 833                err = sock_error(sk) ? : -EPIPE;
 834        if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
 835                send_sig(SIGPIPE, current, 0);
 836        return err;
 837}
 838
    /*
     *      Queue page data on the socket's write queue without copying it.
     *
     *      sk:      socket to send on; tcp_sendpage() calls this with the
     *               socket locked.
     *      pages:   array of pages holding the data.
     *      poffset: byte offset of the data counted from the start of
     *               pages[0]; the page used is pages[poffset / PAGE_SIZE].
     *      psize:   number of bytes to send.
     *      flags:   MSG_* flags; MSG_DONTWAIT and MSG_OOB are examined here,
     *               MSG_MORE and MSG_NOSIGNAL by tcp_push() and tcp_error().
     *
     *      Each chunk is attached as a page fragment to the skb at the tail
     *      of the write queue, or to a newly allocated skb when the tail
     *      cannot take more; get_page() is called for every new fragment.
     *
     *      Returns the number of bytes queued if any were queued, otherwise
     *      the error produced by tcp_error().
     */
 839ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 840                         size_t psize, int flags)
 841{
 842        struct tcp_opt *tp = tcp_sk(sk);
 843        int mss_now;
 844        int err;
 845        ssize_t copied;
 846        long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 847
 848        /* Wait for a connection to finish. */
 849        if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 850                if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
 851                        goto out_err;
 852
 853        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 854
 855        mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 856        copied = 0;
 857
 858        err = -EPIPE;
 859        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 860                goto do_error;
 861
 862        while (psize > 0) {
 863                struct sk_buff *skb = sk->sk_write_queue.prev;
 864                struct page *page = pages[poffset / PAGE_SIZE];
 865                int copy, i;
 866                int offset = poffset % PAGE_SIZE;
                    /* Never cross a page boundary in one step. */
 867                int size = min_t(size_t, psize, PAGE_SIZE - offset);
 868
                    /* Need a new skb if nothing is pending or the tail is
                     * already at least mss_now long.
                     */
 869                if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
 870new_segment:
 871                        if (!tcp_memory_free(sk))
 872                                goto wait_for_sndbuf;
 873
 874                        skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
 875                                             sk->sk_allocation);
 876                        if (!skb)
 877                                goto wait_for_memory;
 878
 879                        skb_entail(sk, tp, skb);
 880                        copy = mss_now;
 881                }
 882
 883                if (copy > size)
 884                        copy = size;
 885
 886                i = skb_shinfo(skb)->nr_frags;
 887                if (can_coalesce(skb, i, page, offset)) {
                            /* Contiguous with the last fragment: just grow it. */
 888                        skb_shinfo(skb)->frags[i - 1].size += copy;
 889                } else if (i < MAX_SKB_FRAGS) {
 890                        get_page(page);
 891                        fill_page_desc(skb, i, page, offset, copy);
 892                } else {
                            /* All fragment slots are in use: start a new skb. */
 893                        tcp_mark_push(tp, skb);
 894                        goto new_segment;
 895                }
 896
 897                skb->len += copy;
 898                skb->data_len += copy;
 899                skb->ip_summed = CHECKSUM_HW;
 900                tp->write_seq += copy;
 901                TCP_SKB_CB(skb)->end_seq += copy;
 902
 903                if (!copied)
 904                        TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 905
 906                copied += copy;
 907                poffset += copy;
 908                if (!(psize -= copy))
 909                        goto out;
 910
                    /* Only push from inside the loop once the skb is exactly
                     * mss_now long and this is not an MSG_OOB send.
                     */
 911                if (skb->len != mss_now || (flags & MSG_OOB))
 912                        continue;
 913
 914                if (forced_push(tp)) {
 915                        tcp_mark_push(tp, skb);
 916                        __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
 917                } else if (skb == tp->send_head)
 918                        tcp_push_one(sk, mss_now);
 919                continue;
 920
 921wait_for_sndbuf:
 922                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 923wait_for_memory:
                    /* Push what is already queued (MSG_MORE masked off)
                     * before sleeping.
                     */
 924                if (copied)
 925                        tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 926
 927                if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
 928                        goto do_error;
 929
                    /* Recompute the MSS after the wait (the lock was dropped). */
 930                mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 931        }
 932
 933out:
 934        if (copied)
 935                tcp_push(sk, tp, flags, mss_now, tp->nonagle);
 936        return copied;
 937
 938do_error:
            /* A partial send is reported as success with the byte count. */
 939        if (copied)
 940                goto out;
 941out_err:
 942        return tcp_error(sk, flags, err);
 943}
 944
 945ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 946                     size_t size, int flags)
 947{
 948        ssize_t res;
 949        struct sock *sk = sock->sk;
 950
 951#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
 952
 953        if (!(sk->sk_route_caps & NETIF_F_SG) ||
 954            !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
 955                return sock_no_sendpage(sock, page, offset, size, flags);
 956
 957#undef TCP_ZC_CSUM_FLAGS
 958
 959        lock_sock(sk);
 960        TCP_CHECK_TIMER(sk);
 961        res = do_tcp_sendpages(sk, &page, offset, size, flags);
 962        TCP_CHECK_TIMER(sk);
 963        release_sock(sk);
 964        return res;
 965}
 966
    /*
     * Accessors for the sndmsg_page / sndmsg_off fields of the inet socket:
     * the page tcp_sendmsg() copies user data into once the skb head has no
     * tailroom, and the current offset within that page.
     */
 967#define TCP_PAGE(sk)    (inet_sk(sk)->sndmsg_page)
 968#define TCP_OFF(sk)     (inet_sk(sk)->sndmsg_off)
 969
 970static inline int tcp_copy_to_page(struct sock *sk, char *from,
 971                                   struct sk_buff *skb, struct page *page,
 972                                   int off, int copy)
 973{
 974        int err = 0;
 975        unsigned int csum;
 976
 977        if (skb->ip_summed == CHECKSUM_NONE) {
 978                csum = csum_and_copy_from_user(from, page_address(page) + off,
 979                                       copy, 0, &err);
 980                if (err) return err;
 981                skb->csum = csum_block_add(skb->csum, csum, skb->len);
 982        } else {
 983                if (copy_from_user(page_address(page) + off, from, copy))
 984                        return -EFAULT;
 985        }
 986
 987        skb->len += copy;
 988        skb->data_len += copy;
 989        skb->truesize += copy;
 990        sk->sk_wmem_queued += copy;
 991        sk->sk_forward_alloc -= copy;
 992        return 0;
 993}
 994
 995static inline int skb_add_data(struct sk_buff *skb, char *from, int copy)
 996{
 997        int err = 0;
 998        unsigned int csum;
 999        int off = skb->len;
1000
1001        if (skb->ip_summed == CHECKSUM_NONE) {
1002                csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1003                                       copy, 0, &err);
1004                if (!err) {
1005                        skb->csum = csum_block_add(skb->csum, csum, off);
1006                        return 0;
1007                }
1008        } else {
1009                if (!copy_from_user(skb_put(skb, copy), from, copy))
1010                        return 0;
1011        }
1012
1013        __skb_trim(skb, off);
1014        return -EFAULT;
1015}
1016
1017static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1018{
1019        int tmp = tp->mss_cache_std;
1020
1021        if (sk->sk_route_caps & NETIF_F_SG) {
1022                int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1023
1024                if (tmp >= pgbreak &&
1025                    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1026                        tmp = pgbreak;
1027        }
1028        return tmp;
1029}
1030
    /*
     *      Copy user data described by msg into the socket's write queue
     *      and push it out.
     *
     *      iocb: not referenced in this function.
     *      sk:   socket to send on; locked here for the whole call.
     *      msg:  supplies the iovec array (msg_iov, msg_iovlen) and the
     *            MSG_* flags (msg_flags).
     *      size: not referenced in this function; the amount sent is
     *            driven by the iovec lengths.
     *
     *      Data is appended to the skb at the tail of the write queue when
     *      it has room below mss_now, first into the skb's linear tailroom
     *      and then into page fragments backed by the per-socket page
     *      (TCP_PAGE / TCP_OFF); otherwise a new skb is allocated.
     *
     *      Returns the number of bytes queued if any were queued, otherwise
     *      the error produced by tcp_error().
     */
1031int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1032                int size)
1033{
1034        struct iovec *iov;
1035        struct tcp_opt *tp = tcp_sk(sk);
1036        struct sk_buff *skb;
1037        int iovlen, flags;
1038        int mss_now;
1039        int err, copied;
1040        long timeo;
1041
1042        lock_sock(sk);
1043        TCP_CHECK_TIMER(sk);
1044
1045        flags = msg->msg_flags;
1046        timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1047
1048        /* Wait for a connection to finish. */
1049        if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1050                if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1051                        goto out_err;
1052
1053        /* This should be in poll */
1054        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1055
1056        mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1057
1058        /* Ok commence sending. */
1059        iovlen = msg->msg_iovlen;
1060        iov = msg->msg_iov;
1061        copied = 0;
1062
1063        err = -EPIPE;
1064        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1065                goto do_error;
1066
            /* Outer loop: one iteration per iovec entry. */
1067        while (--iovlen >= 0) {
1068                int seglen = iov->iov_len;
1069                unsigned char *from = iov->iov_base;
1070
1071                iov++;
1072
                    /* Inner loop: until this iovec entry is fully consumed. */
1073                while (seglen > 0) {
1074                        int copy;
1075
1076                        skb = sk->sk_write_queue.prev;
1077
                            /* Need a new skb if nothing is pending or the
                             * tail is already at least mss_now long.
                             */
1078                        if (!tp->send_head ||
1079                            (copy = mss_now - skb->len) <= 0) {
1080
1081new_segment:
1082                                /* Allocate new segment. If the interface is SG,
1083                                 * allocate skb fitting to single page.
1084                                 */
1085                                if (!tcp_memory_free(sk))
1086                                        goto wait_for_sndbuf;
1087
1088                                skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1089                                                     0, sk->sk_allocation);
1090                                if (!skb)
1091                                        goto wait_for_memory;
1092
1093                                /*
1094                                 * Check whether we can use HW checksum.
1095                                 */
1096                                if (sk->sk_route_caps &
1097                                    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1098                                     NETIF_F_HW_CSUM))
1099                                        skb->ip_summed = CHECKSUM_HW;
1100
1101                                skb_entail(sk, tp, skb);
1102                                copy = mss_now;
1103                        }
1104
1105                        /* Try to append data to the end of skb. */
1106                        if (copy > seglen)
1107                                copy = seglen;
1108
1109                        /* Where to copy to? */
1110                        if (skb_tailroom(skb) > 0) {
1111                                /* We have some space in skb head. Superb! */
1112                                if (copy > skb_tailroom(skb))
1113                                        copy = skb_tailroom(skb);
1114                                if ((err = skb_add_data(skb, from, copy)) != 0)
1115                                        goto do_fault;
1116                        } else {
1117                                int merge = 0;
1118                                int i = skb_shinfo(skb)->nr_frags;
1119                                struct page *page = TCP_PAGE(sk);
1120                                int off = TCP_OFF(sk);
1121
1122                                if (can_coalesce(skb, i, page, off) &&
1123                                    off != PAGE_SIZE) {
1124                                        /* We can extend the last page
1125                                         * fragment. */
1126                                        merge = 1;
1127                                } else if (i == MAX_SKB_FRAGS ||
1128                                           (!i &&
1129                                           !(sk->sk_route_caps & NETIF_F_SG))) {
1130                                        /* Need to add new fragment and cannot
1131                                         * do this because interface is non-SG,
1132                                         * or because all the page slots are
1133                                         * busy. */
1134                                        tcp_mark_push(tp, skb);
1135                                        goto new_segment;
1136                                } else if (page) {
1137                                        /* If page is cached, align
1138                                         * offset to L1 cache boundary
1139                                         */
1140                                        off = (off + L1_CACHE_BYTES - 1) &
1141                                              ~(L1_CACHE_BYTES - 1);
                                            /* Aligning used up the page: drop
                                             * the cached reference.
                                             */
1142                                        if (off == PAGE_SIZE) {
1143                                                put_page(page);
1144                                                TCP_PAGE(sk) = page = NULL;
1145                                        }
1146                                }
1147
1148                                if (!page) {
1149                                        /* Allocate new cache page. */
1150                                        if (!(page = tcp_alloc_page(sk)))
1151                                                goto wait_for_memory;
1152                                        off = 0;
1153                                }
1154
1155                                if (copy > PAGE_SIZE - off)
1156                                        copy = PAGE_SIZE - off;
1157
1158                                /* Time to copy data. We are close to
1159                                 * the end! */
1160                                err = tcp_copy_to_page(sk, from, skb, page,
1161                                                       off, copy);
1162                                if (err) {
1163                                        /* If this page was new, give it to the
1164                                         * socket so it does not get leaked.
1165                                         */
1166                                        if (!TCP_PAGE(sk)) {
1167                                                TCP_PAGE(sk) = page;
1168                                                TCP_OFF(sk) = 0;
1169                                        }
1170                                        goto do_error;
1171                                }
1172
1173                                /* Update the skb. */
1174                                if (merge) {
1175                                        skb_shinfo(skb)->frags[i - 1].size +=
1176                                                                        copy;
1177                                } else {
1178                                        fill_page_desc(skb, i, page, off, copy);
                                            /* Take a page reference for the new
                                             * fragment when the page is (or now
                                             * becomes) the socket's cached page;
                                             * a freshly allocated page that got
                                             * filled to its end is not cached
                                             * and no extra reference is taken.
                                             */
1179                                        if (TCP_PAGE(sk)) {
1180                                                get_page(page);
1181                                        } else if (off + copy < PAGE_SIZE) {
1182                                                get_page(page);
1183                                                TCP_PAGE(sk) = page;
1184                                        }
1185                                }
1186
1187                                TCP_OFF(sk) = off + copy;
1188                        }
1189
1190                        if (!copied)
1191                                TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1192
1193                        tp->write_seq += copy;
1194                        TCP_SKB_CB(skb)->end_seq += copy;
1195
1196                        from += copy;
1197                        copied += copy;
                            /* Last byte of the last iovec entry: finish up. */
1198                        if ((seglen -= copy) == 0 && iovlen == 0)
1199                                goto out;
1200
                            /* Only push from inside the loop once the skb is
                             * exactly mss_now long and this is not MSG_OOB.
                             */
1201                        if (skb->len != mss_now || (flags & MSG_OOB))
1202                                continue;
1203
1204                        if (forced_push(tp)) {
1205                                tcp_mark_push(tp, skb);
1206                                __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1207                        } else if (skb == tp->send_head)
1208                                tcp_push_one(sk, mss_now);
1209                        continue;
1210
1211wait_for_sndbuf:
1212                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1213wait_for_memory:
                            /* Push what is already queued (MSG_MORE masked
                             * off) before sleeping.
                             */
1214                        if (copied)
1215                                tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1216
1217                        if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1218                                goto do_error;
1219
                            /* Recompute the MSS after the wait (the lock was
                             * dropped).
                             */
1220                        mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1221                }
1222        }
1223
1224out:
1225        if (copied)
1226                tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1227        TCP_CHECK_TIMER(sk);
1228        release_sock(sk);
1229        return copied;
1230
1231do_fault:
            /* The copy into the skb head failed; if the skb is still empty,
             * take it back off the write queue and free it.
             */
1232        if (!skb->len) {
1233                if (tp->send_head == skb)
1234                        tp->send_head = NULL;
1235                __skb_unlink(skb, skb->list);
1236                tcp_free_skb(sk, skb);
1237        }
1238
1239do_error:
            /* A partial send is reported as success with the byte count. */
1240        if (copied)
1241                goto out;
1242out_err:
1243        err = tcp_error(sk, flags, err);
1244        TCP_CHECK_TIMER(sk);
1245        release_sock(sk);
1246        return err;
1247}
1248
1249/*
1250 *      Handle reading urgent data. BSD has very simple semantics for
1251 *      this, no blocking and very strange errors 8)
1252 */
1253
1254static int tcp_recv_urg(struct sock *sk, long timeo,
1255                        struct msghdr *msg, int len, int flags,
1256                        int *addr_len)
1257{
1258        struct tcp_opt *tp = tcp_sk(sk);
1259
1260        /* No URG data to read. */
1261        if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1262            tp->urg_data == TCP_URG_READ)
1263                return -EINVAL; /* Yes this is right ! */
1264
1265        if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1266                return -ENOTCONN;
1267
1268        if (tp->urg_data & TCP_URG_VALID) {
1269                int err = 0;
1270                char c = tp->urg_data;
1271
1272                if (!(flags & MSG_PEEK))
1273                        tp->urg_data = TCP_URG_READ;
1274
1275                /* Read urgent data. */
1276                msg->msg_flags |= MSG_OOB;
1277
1278                if (len > 0) {
1279                        if (!(flags & MSG_TRUNC))
1280                                err = memcpy_toiovec(msg->msg_iov, &c, 1);
1281                        len = 1;
1282                } else
1283                        msg->msg_flags |= MSG_TRUNC;
1284
1285                return err ? -EFAULT : len;
1286        }
1287
1288        if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1289                return 0;
1290
1291        /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1292         * the available implementations agree in this case:
1293         * this call should never block, independent of the
1294         * blocking state of the socket.
1295         * Mike <pall@rz.uni-karlsruhe.de>
1296         */
1297        return -EAGAIN;
1298}
1299
1300/*
1301 *      Unlink skb from the receive queue of sk and free it, once
1302 *      it is no longer needed. This routine must be called with
1303 *      interrupts disabled or with the socket locked so that the
    *      sk_buff queue operation is ok. The skb must not be touched
    *      by the caller afterwards.
1304 */
1305
1306static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
1307{
1308        __skb_unlink(skb, &sk->sk_receive_queue);
1309        __kfree_skb(skb);
1310}
1311
1312/* Clean up the receive buffer for full frames taken by the user,
1313 * then send an ACK if necessary.  COPIED is the number of bytes
1314 * tcp_recvmsg has given to the user so far, it speeds up the
1315 * calculation of whether or not we must ACK for the sake of
1316 * a window update.
1317 */
1318static void cleanup_rbuf(struct sock *sk, int copied)
1319{
1320        struct tcp_opt *tp = tcp_sk(sk);
1321        int time_to_ack = 0;
1322
1323#if TCP_DEBUG
1324        struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1325
1326        BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1327#endif
1328
1329        if (tcp_ack_scheduled(tp)) {
1330                   /* Delayed ACKs frequently hit locked sockets during bulk
1331                    * receive. */
1332                if (tp->ack.blocked ||
1333                    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1334                    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1335                    /*
1336                     * If this read emptied read buffer, we send ACK, if
1337                     * connection is not bidirectional, user drained
1338                     * receive buffer and there was a small segment
1339                     * in queue.
1340                     */
1341                    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1342                     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1343                        time_to_ack = 1;
1344        }
1345
1346        /* We send an ACK if we can now advertise a non-zero window
1347         * which has been raised "significantly".
1348         *
1349         * Even if window raised up to infinity, do not send window open ACK
1350         * in states, where we will not receive more. It is useless.
1351         */
1352        if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1353                __u32 rcv_window_now = tcp_receive_window(tp);
1354
1355                /* Optimize, __tcp_select_window() is not cheap. */
1356                if (2*rcv_window_now <= tp->window_clamp) {
1357                        __u32 new_window = __tcp_select_window(sk);
1358
1359                        /* Send ACK now, if this read freed lots of space
1360                         * in our buffer. Certainly, new_window is new window.
1361                         * We can advertise it now, if it is not less than current one.
1362                         * "Lots" means "at least twice" here.
1363                         */
1364                        if (new_window && new_window >= 2 * rcv_window_now)
1365                                time_to_ack = 1;
1366                }
1367        }
1368        if (time_to_ack)
1369                tcp_send_ack(sk);
1370}
1371
1372/* Now socket state including sk->sk_err is changed only under lock,
1373 * hence we may omit checks after joining wait queue.
1374 * We check receive queue before schedule() only as optimization;
1375 * it is very likely that release_sock() added new data.
1376 */
1377
1378static long tcp_data_wait(struct sock *sk, long timeo)
1379{
1380        DEFINE_WAIT(wait);
1381
1382        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1383
1384        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1385        release_sock(sk);
1386
1387        if (skb_queue_empty(&sk->sk_receive_queue))
1388                timeo = schedule_timeout(timeo);
1389
1390        lock_sock(sk);
1391        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1392
1393        finish_wait(sk->sk_sleep, &wait);
1394        return timeo;
1395}
1396
1397static void tcp_prequeue_process(struct sock *sk)
1398{
1399        struct sk_buff *skb;
1400        struct tcp_opt *tp = tcp_sk(sk);
1401
1402        NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1403
1404        /* RX process wants to run with disabled BHs, though it is not
1405         * necessary */
1406        local_bh_disable();
1407        while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1408                sk->sk_backlog_rcv(sk, skb);
1409        local_bh_enable();
1410
1411        /* Clear memory counter. */
1412        tp->ucopy.memory = 0;
1413}
1414
1415static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1416{
1417        struct sk_buff *skb;
1418        u32 offset;
1419
1420        skb_queue_walk(&sk->sk_receive_queue, skb) {
1421                offset = seq - TCP_SKB_CB(skb)->seq;
1422                if (skb->h.th->syn)
1423                        offset--;
1424                if (offset < skb->len || skb->h.th->fin) {
1425                        *off = offset;
1426                        return skb;
1427                }
1428        }
1429        return NULL;
1430}
1431
1432/*
1433 * This routine provides an alternative to tcp_recvmsg() for routines
1434 * that would like to handle copying from skbuffs directly in 'sendfile'
1435 * fashion.
     *
     * Starting at tp->copied_seq, every skb found by tcp_recv_skb() that
     * still has unread payload is offered to recv_actor(desc, skb, offset,
     * len).  The walk stops when the next byte is urgent data, as soon as
     * an skb is not consumed to its end, after a FIN has been eaten, or
     * once desc->count is zero.
     *
     * Returns -ENOTCONN for a listening socket, otherwise the number of
     * bytes the actor reported as used.  tp->copied_seq is advanced to
     * match, and cleanup_rbuf() is called when anything was used.
     *
1436 * Note:
1437 *      - It is assumed that the socket was locked by the caller.
1438 *      - The routine does not block.
1439 *      - At present, there is no support for reading OOB data
1440 *        or for 'peeking' the socket using this routine
1441 *        (although both would be easy to implement).
1442 */
1443int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1444                  sk_read_actor_t recv_actor)
1445{
1446        struct sk_buff *skb;
1447        struct tcp_opt *tp = tcp_sk(sk);
1448        u32 seq = tp->copied_seq;
1449        u32 offset;
1450        int copied = 0;
1451
1452        if (sk->sk_state == TCP_LISTEN)
1453                return -ENOTCONN;
1454        while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
                    /* offset >= skb->len means the skb was returned only
                     * because it carries a FIN; there is no data to offer.
                     */
1455                if (offset < skb->len) {
1456                        size_t used, len;
1457
1458                        len = skb->len - offset;
1459                        /* Stop reading if we hit a patch of urgent data */
1460                        if (tp->urg_data) {
1461                                u32 urg_offset = tp->urg_seq - seq;
1462                                if (urg_offset < len)
1463                                        len = urg_offset;
1464                                if (!len)
1465                                        break;
1466                        }
1467                        used = recv_actor(desc, skb, offset, len);
                            /* NOTE(review): 'used' is a size_t.  If the actor
                             * can return a negative value (its prototype is
                             * not visible here), it would fail this test and
                             * merely end the loop below without the error
                             * being reported to the caller -- confirm.
                             */
1468                        if (used <= len) {
1469                                seq += used;
1470                                copied += used;
1471                                offset += used;
1472                        }
                            /* Anything short of the end of the skb ends the walk. */
1473                        if (offset != skb->len)
1474                                break;
1475                }
1476                if (skb->h.th->fin) {
1477                        tcp_eat_skb(sk, skb);
                            /* Step the read sequence over the FIN as well. */
1478                        ++seq;
1479                        break;
1480                }
1481                tcp_eat_skb(sk, skb);
1482                if (!desc->count)
1483                        break;
1484        }
1485        tp->copied_seq = seq;
1486        /* Clean up data we have read: This will do ACK frames. */
1487        if (copied)
1488                cleanup_rbuf(sk, copied);
1489        return copied;
1490}
1491
1492/*
1493 *      This routine copies from a sock struct into the user buffer.
1494 *
1495 *      Technical note: in 2.3 we work on _locked_ socket, so that
1496 *      tricks with *seq access order and skb->users are not required.
1497 *      Probably, code can be easily improved even more.
     *
     *      The socket is taken with lock_sock() on entry and released
     *      before returning.  MSG_OOB is handed to tcp_recv_urg().  With
     *      MSG_PEEK a private copy of the read sequence (peek_seq) is
     *      advanced instead of tp->copied_seq and no skb is eaten.  With
     *      MSG_TRUNC the data is stepped over without being copied to
     *      msg->msg_iov.
     *
     *      Returns the number of bytes copied (or stepped over), or a
     *      negative errno when nothing was: -ENOTCONN (listening socket, or
     *      TCP_CLOSE without SOCK_DONE), -EAGAIN (nothing available and a
     *      zero timeout), the value of sock_intr_errno() (signal pending),
     *      the value of sock_error() (sk_err set) or -EFAULT (copy to the
     *      iovec failed).
1498 */
1499
1500int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1501                int len, int nonblock, int flags, int *addr_len)
1502{
1503        struct tcp_opt *tp = tcp_sk(sk);
1504        int copied = 0;
1505        u32 peek_seq;
1506        u32 *seq;
1507        unsigned long used;
1508        int err;
1509        int target;             /* Read at least this many bytes */
1510        long timeo;
1511        struct task_struct *user_recv = NULL;
1512
1513        lock_sock(sk);
1514
1515        TCP_CHECK_TIMER(sk);
1516
1517        err = -ENOTCONN;
1518        if (sk->sk_state == TCP_LISTEN)
1519                goto out;
1520
1521        timeo = sock_rcvtimeo(sk, nonblock);
1522
1523        /* Urgent data needs to be handled specially. */
1524        if (flags & MSG_OOB)
1525                goto recv_urg;
1526
            /* From here on *seq is the read position: the socket's own
             * copied_seq normally, a throw-away copy when peeking.
             */
1527        seq = &tp->copied_seq;
1528        if (flags & MSG_PEEK) {
1529                peek_seq = tp->copied_seq;
1530                seq = &peek_seq;
1531        }
1532
1533        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1534
1535        do {
1536                struct sk_buff *skb;
1537                u32 offset;
1538
1539                /* Are we at urgent data? Stop if we have read anything. */
1540                if (copied && tp->urg_data && tp->urg_seq == *seq)
1541                        break;
1542
1543                /* We need to check signals first, to get correct SIGURG
1544                 * handling. FIXME: Need to check this doesn't impact 1003.1g
1545                 * and move it down to the bottom of the loop
1546                 */
1547                if (signal_pending(current)) {
1548                        if (copied)
1549                                break;
1550                        copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1551                        break;
1552                }
1553
1554                /* Next get a buffer. */
1555
1556                skb = skb_peek(&sk->sk_receive_queue);
1557                do {
1558                        if (!skb)
1559                                break;
1560
1561                        /* Now that we have two receive queues this
1562                         * shouldn't happen.
1563                         */
1564                        if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1565                                printk(KERN_INFO "recvmsg bug: copied %X "
1566                                       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1567                                break;
1568                        }
1569                        offset = *seq - TCP_SKB_CB(skb)->seq;
1570                        if (skb->h.th->syn)
1571                                offset--;
1572                        if (offset < skb->len)
1573                                goto found_ok_skb;
1574                        if (skb->h.th->fin)
1575                                goto found_fin_ok;
                            /* A fully read skb is only expected to remain
                             * queued when peeking (nothing is eaten then).
                             */
1576                        BUG_TRAP(flags & MSG_PEEK);
1577                        skb = skb->next;
1578                } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1579
1580                /* Well, if we have backlog, try to process it now yet. */
1581
1582                if (copied >= target && !sk->sk_backlog.tail)
1583                        break;
1584
                    /* Decide whether waiting for more data makes sense.
                     * With data in hand we just stop; without any, the
                     * reason for stopping becomes the return value.
                     */
1585                if (copied) {
1586                        if (sk->sk_err ||
1587                            sk->sk_state == TCP_CLOSE ||
1588                            (sk->sk_shutdown & RCV_SHUTDOWN) ||
1589                            !timeo ||
1590                            (flags & MSG_PEEK))
1591                                break;
1592                } else {
1593                        if (sock_flag(sk, SOCK_DONE))
1594                                break;
1595
1596                        if (sk->sk_err) {
1597                                copied = sock_error(sk);
1598                                break;
1599                        }
1600
1601                        if (sk->sk_shutdown & RCV_SHUTDOWN)
1602                                break;
1603
1604                        if (sk->sk_state == TCP_CLOSE) {
1605                                if (!sock_flag(sk, SOCK_DONE)) {
1606                                        /* This occurs when user tries to read
1607                                         * from never connected socket.
1608                                         */
1609                                        copied = -ENOTCONN;
1610                                        break;
1611                                }
1612                                break;
1613                        }
1614
1615                        if (!timeo) {
1616                                copied = -EAGAIN;
1617                                break;
1618                        }
1619                }
1620
1621                cleanup_rbuf(sk, copied);
1622
                    /* True when no reader is installed yet (both NULL) or
                     * when this task is the installed reader.
                     */
1623                if (tp->ucopy.task == user_recv) {
1624                        /* Install new reader */
1625                        if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1626                                user_recv = current;
1627                                tp->ucopy.task = user_recv;
1628                                tp->ucopy.iov = msg->msg_iov;
1629                        }
1630
1631                        tp->ucopy.len = len;
1632
1633                        BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1634                                 (flags & (MSG_PEEK | MSG_TRUNC)));
1635
1636                        /* Ugly... If prequeue is not empty, we have to
1637                         * process it before releasing socket, otherwise
1638                         * order will be broken at second iteration.
1639                         * More elegant solution is required!!!
1640                         *
1641                         * Look: we have the following (pseudo)queues:
1642                         *
1643                         * 1. packets in flight
1644                         * 2. backlog
1645                         * 3. prequeue
1646                         * 4. receive_queue
1647                         *
1648                         * Each queue can be processed only if the next ones
1649                         * are empty. At this point we have empty receive_queue.
1650                         * But prequeue _can_ be not empty after 2nd iteration,
1651                         * when we jumped to start of loop because backlog
1652                         * processing added something to receive_queue.
1653                         * We cannot release_sock(), because backlog contains
1654                         * packets arrived _after_ prequeued ones.
1655                         *
1656                         * Shortly, algorithm is clear --- to process all
1657                         * the queues in order. We could make it more directly,
1658                         * requeueing packets from backlog to prequeue, if
1659                         * is not empty. It is more elegant, but eats cycles,
1660                         * unfortunately.
1661                         */
1662                        if (skb_queue_len(&tp->ucopy.prequeue))
1663                                goto do_prequeue;
1664
1665                        /* __ Set realtime policy in scheduler __ */
1666                }
1667
1668                if (copied >= target) {
1669                        /* Do not sleep, just process backlog. */
1670                        release_sock(sk);
1671                        lock_sock(sk);
1672                } else {
1673                        timeo = tcp_data_wait(sk, timeo);
1674                }
1675
1676                if (user_recv) {
1677                        int chunk;
1678
1679                        /* __ Restore normal policy in scheduler __ */
1680
                            /* tp->ucopy.len was set to len above; whatever it
                             * has dropped by since is counted as copied data
                             * (presumably copied straight to the iovec by the
                             * receive path, which is not visible here).
                             */
1681                        if ((chunk = len - tp->ucopy.len) != 0) {
1682                                NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1683                                len -= chunk;
1684                                copied += chunk;
1685                        }
1686
1687                        if (tp->rcv_nxt == tp->copied_seq &&
1688                            skb_queue_len(&tp->ucopy.prequeue)) {
1689do_prequeue:
1690                                tcp_prequeue_process(sk);
1691
1692                                if ((chunk = len - tp->ucopy.len) != 0) {
1693                                        NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1694                                        len -= chunk;
1695                                        copied += chunk;
1696                                }
1697                        }
1698                }
                    /* While peeking, copied_seq moving under us means some
                     * other reader consumed data; resynchronise peek_seq.
                     */
1699                if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1700                        if (net_ratelimit())
1701                                printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1702                                       current->comm, current->pid);
1703                        peek_seq = tp->copied_seq;
1704                }
1705                continue;
1706
1707        found_ok_skb:
1708                /* Ok so how much can we use? */
1709                used = skb->len - offset;
1710                if (len < used)
1711                        used = len;
1712
1713                /* Do we have urgent data here? */
1714                if (tp->urg_data) {
1715                        u32 urg_offset = tp->urg_seq - *seq;
1716                        if (urg_offset < used) {
                                    /* urg_offset == 0: the urgent byte is next.
                                     * Unless SOCK_URGINLINE is set it is stepped
                                     * over without being copied.  Otherwise
                                     * copy only up to the urgent byte.
                                     */
1717                                if (!urg_offset) {
1718                                        if (!sock_flag(sk, SOCK_URGINLINE)) {
1719                                                ++*seq;
1720                                                offset++;
1721                                                used--;
1722                                                if (!used)
1723                                                        goto skip_copy;
1724                                        }
1725                                } else
1726                                        used = urg_offset;
1727                        }
1728                }
1729
1730                if (!(flags & MSG_TRUNC)) {
1731                        err = skb_copy_datagram_iovec(skb, offset,
1732                                                      msg->msg_iov, used);
1733                        if (err) {
1734                                /* Exception. Bailout! */
1735                                if (!copied)
1736                                        copied = -EFAULT;
1737                                break;
1738                        }
1739                }
1740
1741                *seq += used;
1742                copied += used;
1743                len -= used;
1744
1745skip_copy:
1746                if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1747                        tp->urg_data = 0;
1748                        tcp_fast_path_check(sk, tp);
1749                }
                    /* Data remains in this skb: leave it queued. */
1750                if (used + offset < skb->len)
1751                        continue;
1752
1753                if (skb->h.th->fin)
1754                        goto found_fin_ok;
1755                if (!(flags & MSG_PEEK))
1756                        tcp_eat_skb(sk, skb);
1757                continue;
1758
1759        found_fin_ok:
1760                /* Process the FIN. */
1761                ++*seq;
1762                if (!(flags & MSG_PEEK))
1763                        tcp_eat_skb(sk, skb);
1764                break;
1765        } while (len > 0);
1766
            /* We installed ourselves as the direct-copy reader: flush what
             * is still prequeued and uninstall.
             */
1767        if (user_recv) {
1768                if (skb_queue_len(&tp->ucopy.prequeue)) {
1769                        int chunk;
1770
                            /* When an error is being returned (copied <= 0)
                             * leave no room for further direct copies.
                             */
1771                        tp->ucopy.len = copied > 0 ? len : 0;
1772
1773                        tcp_prequeue_process(sk);
1774
1775                        if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1776                                NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1777                                len -= chunk;
1778                                copied += chunk;
1779                        }
1780                }
1781
1782                tp->ucopy.task = NULL;
1783                tp->ucopy.len = 0;
1784        }
1785
1786        /* According to UNIX98, msg_name/msg_namelen are ignored
1787         * on connected socket. I was just happy when found this 8) --ANK
1788         */
1789
1790        /* Clean up data we have read: This will do ACK frames. */
1791        cleanup_rbuf(sk, copied);
1792
1793        TCP_CHECK_TIMER(sk);
1794        release_sock(sk);
1795        return copied;
1796
1797out:
1798        TCP_CHECK_TIMER(sk);
1799        release_sock(sk);
1800        return err;
1801
1802recv_urg:
1803        err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1804        goto out;
1805}
1806
1807/*
1808 *      State processing on a close. This implements the state shift for
1809 *      sending our FIN frame. Note that we only send a FIN for some
1810 *      states. A shutdown() may have already sent the FIN, or we may be
1811 *      closed.
     *
     *      The table is indexed by the current sk->sk_state.  Each entry
     *      holds the state to move to, OR-ed with TCP_ACTION_FIN when a
     *      FIN has to be sent as part of the transition; tcp_close_state()
     *      separates the two parts again with TCP_STATE_MASK and
     *      TCP_ACTION_FIN.  The initialisers are positional, so their
     *      order must match the numeric values of the TCP_* states named
     *      in the per-line comments.  Only the first 12 of the 16 slots
     *      are given explicit values.
1812 */
1813
1814static unsigned char new_state[16] = {
1815  /* current state:        new state:      action:      */
1816  /* (Invalid)          */ TCP_CLOSE,
1817  /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1818  /* TCP_SYN_SENT       */ TCP_CLOSE,
1819  /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1820  /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1821  /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1822  /* TCP_TIME_WAIT      */ TCP_CLOSE,
1823  /* TCP_CLOSE          */ TCP_CLOSE,
1824  /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1825  /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1826  /* TCP_LISTEN         */ TCP_CLOSE,
1827  /* TCP_CLOSING        */ TCP_CLOSING,
1828};
1829
1830static int tcp_close_state(struct sock *sk)
1831{
1832        int next = (int)new_state[sk->sk_state];
1833        int ns = next & TCP_STATE_MASK;
1834
1835        tcp_set_state(sk, ns);
1836
1837        return next & TCP_ACTION_FIN;
1838}
1839
1840/*
1841 *      Shutdown the sending side of a connection. Much like close except
1842 *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1843 */
1844
1845void tcp_shutdown(struct sock *sk, int how)
1846{
1847        /*      We need to grab some memory, and put together a FIN,
1848         *      and then put it into the queue to be sent.
1849         *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1850         */
1851        if (!(how & SEND_SHUTDOWN))
1852                return;
1853
1854        /* If we've already sent a FIN, or it's a closed state, skip this. */
1855        if ((1 << sk->sk_state) &
1856            (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1857             TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1858                /* Clear out any half completed packets.  FIN if needed. */
1859                if (tcp_close_state(sk))
1860                        tcp_send_fin(sk);
1861        }
1862}
1863
1864
1865/*
1866 *      Return 1 if we still have things to send in our buffers.
1867 */
1868
1869static inline int closing(struct sock *sk)
1870{
1871        return (1 << sk->sk_state) &
1872               (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1873}
1874
1875static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1876{
1877        /* First the read buffer. */
1878        __skb_queue_purge(&sk->sk_receive_queue);
1879
1880        /* Next, the error queue. */
1881        __skb_queue_purge(&sk->sk_error_queue);
1882
1883        /* Next, the write queue. */
1884        BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1885
1886        /* Account for returned memory. */
1887        tcp_mem_reclaim(sk);
1888
1889        BUG_TRAP(!sk->sk_wmem_queued);
1890        BUG_TRAP(!sk->sk_forward_alloc);
1891
1892        /* It is _impossible_ for the backlog to contain anything
1893         * when we get here.  All user references to this socket
1894         * have gone away, only the net layer knows can touch it.
1895         */
1896}
1897
1898/*
1899 * At this point, there should be no process reference to this
1900 * socket, and thus no user references at all.  Therefore we
1901 * can assume the socket waitqueue is inactive and nobody will
1902 * try to jump onto it.
     *
     * The socket must already be in TCP_CLOSE, flagged SOCK_DEAD and
     * unhashed (all three are checked with BUG_TRAP).  The protocol's
     * destroy hook is run, the remaining queues are purged, the orphan
     * count is decremented and one reference is dropped with sock_put().
1903 */
1904void tcp_destroy_sock(struct sock *sk)
1905{
1906        BUG_TRAP(sk->sk_state == TCP_CLOSE);
1907        BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1908
1909        /* It cannot be in hash table! */
1910        BUG_TRAP(sk_unhashed(sk));
1911
1912        /* If it has not 0 inet_sk(sk)->num, it must be bound */
1913        BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1914
1915#ifdef TCP_DEBUG
            /* sk_zapped marks a socket that already went through here.
             * On a second pass an extra reference is taken -- presumably
             * so that the sock_put() below does not drop the count a
             * second time; confirm.
             */
1916        if (sk->sk_zapped) {
1917                printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1918                sock_hold(sk);
1919        }
1920        sk->sk_zapped = 1;
1921#endif
1922
1923        sk->sk_prot->destroy(sk);
1924
1925        tcp_kill_sk_queues(sk);
1926
1927        xfrm_sk_free_policy(sk);
1928
1929#ifdef INET_REFCNT_DEBUG
            /* A count other than 1 means someone else still holds the
             * socket, so the sock_put() below will not be the last one.
             */
1930        if (atomic_read(&sk->sk_refcnt) != 1) {
1931                printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1932                       sk, atomic_read(&sk->sk_refcnt));
1933        }
1934#endif
1935
            /* Balances the atomic_inc(&tcp_orphan_count) done in tcp_close(). */
1936        atomic_dec(&tcp_orphan_count);
1937        sock_put(sk);
1938}
1939
/*
 *      Close a socket on behalf of its owner.
 *
 *      A listening socket goes straight to TCP_CLOSE.  Otherwise the
 *      receive queue is thrown away and one of three things happens:
 *      unread data was discarded, so the connection is reset; zero-time
 *      linger is set, so the protocol's disconnect() is called; or
 *      tcp_close_state() picks the next state and a FIN is sent when it
 *      asks for one.
 *
 *      A non-zero 'timeout' is passed to schedule_timeout() and bounds
 *      how long we sleep while closing() stays true.
 *
 *      Finally the socket is orphaned under the BH lock and is either
 *      destroyed at once (TCP_CLOSE), handed to tcp_time_wait(), or left
 *      alive until the protocol finishes the close.
 */
1940void tcp_close(struct sock *sk, long timeout)
1941{
1942        struct sk_buff *skb;
1943        int data_was_unread = 0;
1944
1945        lock_sock(sk);
1946        sk->sk_shutdown = SHUTDOWN_MASK;
1947
1948        if (sk->sk_state == TCP_LISTEN) {
1949                tcp_set_state(sk, TCP_CLOSE);
1950
1951                /* Special case. */
1952                tcp_listen_stop(sk);
1953
1954                goto adjudge_to_death;
1955        }
1956
1957        /*  We need to flush the recv. buffs.  We do this only on the
1958         *  descriptor close, not protocol-sourced closes, because the
1959         *  reader process may not have drained the data yet!
1960         */
1961        while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                    /* Sequence space covered by the skb, minus one when it
                     * carries a FIN, so a bare FIN does not count as data.
                     */
1962                u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1963                          skb->h.th->fin;
1964                data_was_unread += len;
1965                __kfree_skb(skb);
1966        }
1967
1968        tcp_mem_reclaim(sk);
1969
1970        /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1971         * 3.10, we send a RST here because data was lost.  To
1972         * witness the awful effects of the old behavior of always
1973         * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1974         * a bulk GET in an FTP client, suspend the process, wait
1975         * for the client to advertise a zero window, then kill -9
1976         * the FTP client, wheee...  Note: timeout is always zero
1977         * in such a case.
1978         */
1979        if (data_was_unread) {
1980                /* Unread data was tossed, zap the connection. */
1981                NET_INC_STATS_USER(TCPAbortOnClose);
1982                tcp_set_state(sk, TCP_CLOSE);
1983                tcp_send_active_reset(sk, GFP_KERNEL);
1984        } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1985                /* Check zero linger _after_ checking for unread data. */
1986                sk->sk_prot->disconnect(sk, 0);
1987                NET_INC_STATS_USER(TCPAbortOnData);
1988        } else if (tcp_close_state(sk)) {
1989                /* We FIN if the application ate all the data before
1990                 * zapping the connection.
1991                 */
1992
1993                /* RED-PEN. Formally speaking, we have broken TCP state
1994                 * machine. State transitions:
1995                 *
1996                 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1997                 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1998                 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1999                 *
2000                 * are legal only when FIN has been sent (i.e. in window),
2001                 * rather than queued out of window. Purists blame.
2002                 *
2003                 * F.e. "RFC state" is ESTABLISHED,
2004                 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2005                 *
2006                 * The visible declinations are that sometimes
2007                 * we enter time-wait state, when it is not required really
2008                 * (harmless), do not send active resets, when they are
2009                 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
2010                 * they look as CLOSING or LAST_ACK for Linux)
2011                 * Probably, I missed some more holelets.
2012                 *                                              --ANK
2013                 */
2014                tcp_send_fin(sk);
2015        }
2016
            /* Lingering close: sleep, with the socket lock dropped, until
             * closing() is no longer true, a signal arrives or the timeout
             * runs out.
             */
2017        if (timeout) {
2018                struct task_struct *tsk = current;
2019                DEFINE_WAIT(wait);
2020
2021                do {
2022                        prepare_to_wait(sk->sk_sleep, &wait,
2023                                        TASK_INTERRUPTIBLE);
2024                        if (!closing(sk))
2025                                break;
2026                        release_sock(sk);
2027                        timeout = schedule_timeout(timeout);
2028                        lock_sock(sk);
2029                } while (!signal_pending(tsk) && timeout);
2030
2031                finish_wait(sk->sk_sleep, &wait);
2032        }
2033
2034adjudge_to_death:
2035        /* It is the last release_sock in its life. It will remove backlog. */
2036        release_sock(sk);
2037
2038
2039        /* Now socket is owned by kernel and we acquire BH lock
2040           to finish close. No need to check for user refs.
2041         */
2042        local_bh_disable();
2043        bh_lock_sock(sk);
2044        BUG_TRAP(!sock_owned_by_user(sk));
2045
            /* Reference held across the orphaning and possible destroy;
             * dropped by the sock_put() at 'out'.
             */
2046        sock_hold(sk);
2047        sock_orphan(sk);
2048
2049        /*      This is a (useful) BSD violating of the RFC. There is a
2050         *      problem with TCP as specified in that the other end could
2051         *      keep a socket open forever with no application left this end.
2052         *      We use a 3 minute timeout (about the same as BSD) then kill
2053         *      our end. If they send after that then tough - BUT: long enough
2054         *      that we won't make the old 4*rto = almost no time - whoops
2055         *      reset mistake.
2056         *
2057         *      Nope, it was not mistake. It is really desired behaviour
2058         *      f.e. on http servers, when such sockets are useless, but
2059         *      consume significant resources. Let's do it with special
2060         *      linger2 option.                                 --ANK
2061         */
2062
2063        if (sk->sk_state == TCP_FIN_WAIT2) {
2064                struct tcp_opt *tp = tcp_sk(sk);
2065                if (tp->linger2 < 0) {
2066                        tcp_set_state(sk, TCP_CLOSE);
2067                        tcp_send_active_reset(sk, GFP_ATOMIC);
2068                        NET_INC_STATS_BH(TCPAbortOnLinger);
2069                } else {
2070                        int tmo = tcp_fin_time(tp);
2071
2072                        if (tmo > TCP_TIMEWAIT_LEN) {
                                    /* NOTE(review): tcp_fin_time(tp) is
                                     * evaluated a second time here instead
                                     * of reusing tmo.
                                     */
2073                                tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2074                        } else {
                                    /* This path jumps past the
                                     * atomic_inc() further down, so the
                                     * orphan is counted here.
                                     */
2075                                atomic_inc(&tcp_orphan_count);
2076                                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2077                                goto out;
2078                        }
2079                }
2080        }
            /* Too many orphans, or this one holds a lot of send memory
             * while TCP memory is above its limit: reset instead of
             * keeping it around.
             */
2081        if (sk->sk_state != TCP_CLOSE) {
2082                tcp_mem_reclaim(sk);
2083                if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2084                    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2085                     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2086                        if (net_ratelimit())
2087                                printk(KERN_INFO "TCP: too many of orphaned "
2088                                       "sockets\n");
2089                        tcp_set_state(sk, TCP_CLOSE);
2090                        tcp_send_active_reset(sk, GFP_ATOMIC);
2091                        NET_INC_STATS_BH(TCPAbortOnMemory);
2092                }
2093        }
            /* Counted even when the socket is destroyed right below:
             * tcp_destroy_sock() decrements the counter again.
             */
2094        atomic_inc(&tcp_orphan_count);
2095
2096        if (sk->sk_state == TCP_CLOSE)
2097                tcp_destroy_sock(sk);
2098        /* Otherwise, socket is reprieved until protocol close. */
2099
2100out:
2101        bh_unlock_sock(sk);
2102        local_bh_enable();
2103        sock_put(sk);
2104}
2105
2106/* These states need RST on ABORT according to RFC793 */
2107
2108static inline int tcp_need_reset(int state)
2109{
2110        return (1 << state) &
2111               (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2112                TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2113}
2114
/*
 *      Abort whatever the socket was doing and bring it back to a clean
 *      TCP_CLOSE state: a listener is stopped, a connection is reset where
 *      required, all timers and queues are cleared and the per-connection
 *      TCP state is re-initialised.  sk->sk_error_report() is called at
 *      the end.
 *
 *      'flags' is not used in this function.  The return value is 'err',
 *      which is never changed from its initial 0.
 */
2115int tcp_disconnect(struct sock *sk, int flags)
2116{
2117        struct inet_opt *inet = inet_sk(sk);
2118        struct tcp_opt *tp = tcp_sk(sk);
2119        int err = 0;
2120        int old_state = sk->sk_state;
2121
2122        if (old_state != TCP_CLOSE)
2123                tcp_set_state(sk, TCP_CLOSE);
2124
2125        /* ABORT function of RFC793 */
2126        if (old_state == TCP_LISTEN) {
2127                tcp_listen_stop(sk);
2128        } else if (tcp_need_reset(old_state) ||
2129                   (tp->snd_nxt != tp->write_seq &&
2130                    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2131                /* The last check adjusts for discrepance of Linux wrt. RFC
2132                 * states
2133                 */
2134                tcp_send_active_reset(sk, gfp_any());
2135                sk->sk_err = ECONNRESET;
2136        } else if (old_state == TCP_SYN_SENT)
2137                sk->sk_err = ECONNRESET;
2138
            /* Stop timers and drop everything queued in either direction. */
2139        tcp_clear_xmit_timers(sk);
2140        __skb_queue_purge(&sk->sk_receive_queue);
2141        tcp_writequeue_purge(sk);
2142        __skb_queue_purge(&tp->out_of_order_queue);
2143
2144        inet->dport = 0;
2145
            /* Keep the source address only if it is locked (SOCK_BINDADDR_LOCK). */
2146        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2147                inet_reset_saddr(sk);
2148
2149        sk->sk_shutdown = 0;
2150        sock_reset_flag(sk, SOCK_DONE);
2151        tp->srtt = 0;
            /* Move write_seq forward by max_window + 2 and never leave it at 0. */
2152        if ((tp->write_seq += tp->max_window + 2) == 0)
2153                tp->write_seq = 1;
2154        tp->backoff = 0;
2155        tp->snd_cwnd = 2;
2156        tp->probes_out = 0;
2157        tp->packets_out = 0;
2158        tp->snd_ssthresh = 0x7fffffff;
2159        tp->snd_cwnd_cnt = 0;
2160        tp->ca_state = TCP_CA_Open;
2161        tcp_clear_retrans(tp);
2162        tcp_delack_init(tp);
2163        tp->send_head = NULL;
2164        tp->saw_tstamp = 0;
2165        tcp_sack_reset(tp);
2166        __sk_dst_reset(sk);
2167
            /* A socket that still has a local port must still be bound. */
2168        BUG_TRAP(!inet->num || tp->bind_hash);
2169
2170        sk->sk_error_report(sk);
2171        return err;
2172}
2173
2174/*
2175 *      Wait for an incoming connection, avoid race
2176 *      conditions. This must be called with the socket locked.
     *
     *      The socket lock is dropped while sleeping and held again on
     *      return.  Returns 0 once tp->accept_queue is non-empty, -EINVAL
     *      if the socket is no longer in TCP_LISTEN, the value of
     *      sock_intr_errno(timeo) if a signal is pending, or -EAGAIN once
     *      the timeout has run out.
2177 */
2178static int wait_for_connect(struct sock *sk, long timeo)
2179{
2180        struct tcp_opt *tp = tcp_sk(sk);
2181        DEFINE_WAIT(wait);
2182        int err;
2183
2184        /*
2185         * True wake-one mechanism for incoming connections: only
2186         * one process gets woken up, not the 'whole herd'.
2187         * Since we do not 'race & poll' for established sockets
2188         * anymore, the common case will execute the loop only once.
2189         *
2190         * Subtle issue: "add_wait_queue_exclusive()" will be added
2191         * after any current non-exclusive waiters, and we know that
2192         * it will always _stay_ after any new non-exclusive waiters
2193         * because all non-exclusive waiters are added at the
2194         * beginning of the wait-queue. As such, it's ok to "drop"
2195         * our exclusiveness temporarily when we get woken up without
2196         * having to remove and re-insert us on the wait queue.
2197         */
2198        for (;;) {
                    /* Queue ourselves before dropping the lock and testing
                     * the accept queue.
                     */
2199                prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2200                                          TASK_INTERRUPTIBLE);
2201                release_sock(sk);
2202                if (!tp->accept_queue)
2203                        timeo = schedule_timeout(timeo);
2204                lock_sock(sk);
                    /* Each test below is preceded by the value to return
                     * should that test end the loop.
                     */
2205                err = 0;
2206                if (tp->accept_queue)
2207                        break;
2208                err = -EINVAL;
2209                if (sk->sk_state != TCP_LISTEN)
2210                        break;
2211                err = sock_intr_errno(timeo);
2212                if (signal_pending(current))
2213                        break;
2214                err = -EAGAIN;
2215                if (!timeo)
2216                        break;
2217        }
2218        finish_wait(sk->sk_sleep, &wait);
2219        return err;
2220}
2221
2222/*
2223 *      This will accept the next outstanding connection.
2224 */
2225
2226struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2227{
2228        struct tcp_opt *tp = tcp_sk(sk);
2229        struct open_request *req;
2230        struct sock *newsk;
2231        int error;
2232
2233        lock_sock(sk);
2234
2235        /* We need to make sure that this socket is listening,
2236         * and that it has something pending.
2237         */
2238        error = -EINVAL;
2239        if (sk->sk_state != TCP_LISTEN)
2240                goto out;
2241
2242        /* Find already established connection */
2243        if (!tp->accept_queue) {
2244                long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2245
2246                /* If this is a non blocking socket don't sleep */
2247                error = -EAGAIN;
2248                if (!timeo)
2249                        goto out;
2250
2251                error = wait_for_connect(sk, timeo);
2252                if (error)
2253                        goto out;
2254        }
2255
2256        req = tp->accept_queue;
2257        if ((tp->accept_queue = req->dl_next) == NULL)
2258                tp->accept_queue_tail = NULL;
2259
2260        newsk = req->sk;
2261        tcp_acceptq_removed(sk);
2262        tcp_openreq_fastfree(req);
2263        BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2264        release_sock(sk);
2265        return newsk;
2266
2267out:
2268        release_sock(sk);
2269        *err = error;
2270        return NULL;
2271}
2272
2273/*
2274 *      Socket option code for TCP.
2275 */
2276int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2277                   int optlen)
2278{
2279        struct tcp_opt *tp = tcp_sk(sk);
2280        int val;
2281        int err = 0;
2282
2283        if (level != SOL_TCP)
2284                return tp->af_specific->setsockopt(sk, level, optname,
2285                                                   optval, optlen);
2286
2287        if (optlen < sizeof(int))
2288                return -EINVAL;
2289
2290        if (get_user(val, (int *)optval))
2291                return -EFAULT;
2292
2293        lock_sock(sk);
2294
2295        switch (optname) {
2296        case TCP_MAXSEG:
2297                /* Values greater than interface MTU won't take effect. However
2298                 * at the point when this call is done we typically don't yet
2299                 * know which interface is going to be used */
2300                if (val < 8 || val > MAX_TCP_WINDOW) {
2301                        err = -EINVAL;
2302                        break;
2303                }
2304                tp->user_mss = val;
2305                break;
2306
2307        case TCP_NODELAY:
2308                if (val) {
2309                        /* TCP_NODELAY is weaker than TCP_CORK, so that
2310                         * this option on corked socket is remembered, but
2311                         * it is not activated until cork is cleared.
2312                         *
2313                         * However, when TCP_NODELAY is set we make
2314                         * an explicit push, which overrides even TCP_CORK
2315                         * for currently queued segments.
2316                         */
2317                        tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2318                        tcp_push_pending_frames(sk, tp);
2319                } else {
2320                        tp->nonagle &= ~TCP_NAGLE_OFF;
2321                }
2322                break;
2323
2324        case TCP_CORK:
2325                /* When set indicates to always queue non-full frames.
2326                 * Later the user clears this option and we transmit
2327                 * any pending partial frames in the queue.  This is
2328                 * meant to be used alongside sendfile() to get properly
2329                 * filled frames when the user (for example) must write
2330                 * out headers with a write() call first and then use
2331                 * sendfile to send out the data parts.
2332                 *
2333                 * TCP_CORK can be set together with TCP_NODELAY and it is
2334                 * stronger than TCP_NODELAY.
2335                 */
2336                if (val) {
2337                        tp->nonagle |= TCP_NAGLE_CORK;
2338                } else {
2339                        tp->nonagle &= ~TCP_NAGLE_CORK;
2340                        if (tp->nonagle&TCP_NAGLE_OFF)
2341                                tp->nonagle |= TCP_NAGLE_PUSH;
2342                        tcp_push_pending_frames(sk, tp);
2343                }
2344                break;
2345
2346        case TCP_KEEPIDLE:
2347                if (val < 1 || val > MAX_TCP_KEEPIDLE)
2348                        err = -EINVAL;
2349                else {
2350                        tp->keepalive_time = val * HZ;
2351                        if (sock_flag(sk, SOCK_KEEPOPEN) &&
2352                            !((1 << sk->sk_state) &
2353                              (TCPF_CLOSE | TCPF_LISTEN))) {
2354                                __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2355                                if (tp->keepalive_time > elapsed)
2356                                        elapsed = tp->keepalive_time - elapsed;
2357                                else
2358                                        elapsed = 0;
2359                                tcp_reset_keepalive_timer(sk, elapsed);
2360                        }
2361                }
2362                break;
2363        case TCP_KEEPINTVL:
2364                if (val < 1 || val > MAX_TCP_KEEPINTVL)
2365                        err = -EINVAL;
2366                else
2367                        tp->keepalive_intvl = val * HZ;
2368                break;
2369        case TCP_KEEPCNT:
2370                if (val < 1 || val > MAX_TCP_KEEPCNT)
2371                        err = -EINVAL;
2372                else
2373                        tp->keepalive_probes = val;
2374                break;
2375        case TCP_SYNCNT:
2376                if (val < 1 || val > MAX_TCP_SYNCNT)
2377                        err = -EINVAL;
2378                else
2379                        tp->syn_retries = val;
2380                break;
2381
2382        case TCP_LINGER2:
2383                if (val < 0)
2384                        tp->linger2 = -1;
2385                else if (val > sysctl_tcp_fin_timeout / HZ)
2386                        tp->linger2 = 0;
2387                else
2388                        tp->linger2 = val * HZ;
2389                break;
2390
2391        case TCP_DEFER_ACCEPT:
2392                tp->defer_accept = 0;
2393                if (val > 0) {
2394                        /* Translate value in seconds to number of
2395                         * retransmits */
2396                        while (tp->defer_accept < 32 &&
2397                               val > ((TCP_TIMEOUT_INIT / HZ) <<
2398                                       tp->defer_accept))
2399                                tp->defer_accept++;
2400                        tp->defer_accept++;
2401                }
2402                break;
2403
2404        case TCP_WINDOW_CLAMP:
2405                if (!val) {
2406                        if (sk->sk_state != TCP_CLOSE) {
2407                                err = -EINVAL;
2408                                break;
2409                        }
2410                        tp->window_clamp = 0;
2411                } else
2412                        tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2413                                                SOCK_MIN_RCVBUF / 2 : val;
2414                break;
2415
2416        case TCP_QUICKACK:
2417                if (!val) {
2418                        tp->ack.pingpong = 1;
2419                } else {
2420                        tp->ack.pingpong = 0;
2421                        if ((1 << sk->sk_state) &
2422                            (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2423                            tcp_ack_scheduled(tp)) {
2424                                tp->ack.pending |= TCP_ACK_PUSHED;
2425                                cleanup_rbuf(sk, 1);
2426                                if (!(val & 1))
2427                                        tp->ack.pingpong = 1;
2428                        }
2429                }
2430                break;
2431
2432        default:
2433                err = -ENOPROTOOPT;
2434                break;
2435        };
2436        release_sock(sk);
2437        return err;
2438}
2439
2440int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2441                   int *optlen)
2442{
2443        struct tcp_opt *tp = tcp_sk(sk);
2444        int val, len;
2445
2446        if (level != SOL_TCP)
2447                return tp->af_specific->getsockopt(sk, level, optname,
2448                                                   optval, optlen);
2449
2450        if (get_user(len, optlen))
2451                return -EFAULT;
2452
2453        len = min_t(unsigned int, len, sizeof(int));
2454
2455        if (len < 0)
2456                return -EINVAL;
2457
2458        switch (optname) {
2459        case TCP_MAXSEG:
2460                val = tp->mss_cache_std;
2461                if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2462                        val = tp->user_mss;
2463                break;
2464        case TCP_NODELAY:
2465                val = !!(tp->nonagle&TCP_NAGLE_OFF);
2466                break;
2467        case TCP_CORK:
2468                val = !!(tp->nonagle&TCP_NAGLE_CORK);
2469                break;
2470        case TCP_KEEPIDLE:
2471                val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2472                break;
2473        case TCP_KEEPINTVL:
2474                val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2475                break;
2476        case TCP_KEEPCNT:
2477                val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2478                break;
2479        case TCP_SYNCNT:
2480                val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2481                break;
2482        case TCP_LINGER2:
2483                val = tp->linger2;
2484                if (val >= 0)
2485                        val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2486                break;
2487        case TCP_DEFER_ACCEPT:
2488                val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2489                                               (tp->defer_accept - 1));
2490                break;
2491        case TCP_WINDOW_CLAMP:
2492                val = tp->window_clamp;
2493                break;
2494        case TCP_INFO: {
2495                struct tcp_info info;
2496                u32 now = tcp_time_stamp;
2497
2498                if (get_user(len, optlen))
2499                        return -EFAULT;
2500                info.tcpi_state = sk->sk_state;
2501                info.tcpi_ca_state = tp->ca_state;
2502                info.tcpi_retransmits = tp->retransmits;
2503                info.tcpi_probes = tp->probes_out;
2504                info.tcpi_backoff = tp->backoff;
2505                info.tcpi_options = 0;
2506                if (tp->tstamp_ok)
2507                        info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2508                if (tp->sack_ok)
2509                        info.tcpi_options |= TCPI_OPT_SACK;
2510                if (tp->wscale_ok) {
2511                        info.tcpi_options |= TCPI_OPT_WSCALE;
2512                        info.tcpi_snd_wscale = tp->snd_wscale;
2513                        info.tcpi_rcv_wscale = tp->rcv_wscale;
2514                } else {
2515                        info.tcpi_snd_wscale = 0;
2516                        info.tcpi_rcv_wscale = 0;
2517                }
2518                if (tp->ecn_flags & TCP_ECN_OK)
2519                        info.tcpi_options |= TCPI_OPT_ECN;
2520
2521                info.tcpi_rto = (1000000 * tp->rto) / HZ;
2522                info.tcpi_ato = (1000000 * tp->ack.ato) / HZ;
2523                info.tcpi_snd_mss = tp->mss_cache_std;
2524                info.tcpi_rcv_mss = tp->ack.rcv_mss;
2525
2526                info.tcpi_unacked = tp->packets_out;
2527                info.tcpi_sacked = tp->sacked_out;
2528                info.tcpi_lost = tp->lost_out;
2529                info.tcpi_retrans = tp->retrans_out;
2530                info.tcpi_fackets = tp->fackets_out;
2531
2532                info.tcpi_last_data_sent = ((now - tp->lsndtime) * 1000) / HZ;
2533                info.tcpi_last_ack_sent = 0;
2534                info.tcpi_last_data_recv = ((now -
2535                                             tp->ack.lrcvtime) * 1000) / HZ;
2536                info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp) * 1000) / HZ;
2537
2538                info.tcpi_pmtu = tp->pmtu_cookie;
2539                info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2540                info.tcpi_rtt = ((1000000 * tp->srtt) / HZ) >> 3;
2541                info.tcpi_rttvar = ((1000000 * tp->mdev) / HZ) >> 2;
2542                info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2543                info.tcpi_snd_cwnd = tp->snd_cwnd;
2544                info.tcpi_advmss = tp->advmss;
2545                info.tcpi_reordering = tp->reordering;
2546
2547                len = min_t(unsigned int, len, sizeof(info));
2548                if (put_user(len, optlen))
2549                        return -EFAULT;
2550                if (copy_to_user(optval, &info, len))
2551                        return -EFAULT;
2552                return 0;
2553        }
2554        case TCP_QUICKACK:
2555                val = !tp->ack.pingpong;
2556                break;
2557        default:
2558                return -ENOPROTOOPT;
2559        };
2560
2561        if (put_user(len, optlen))
2562                return -EFAULT;
2563        if (copy_to_user(optval, &val, len))
2564                return -EFAULT;
2565        return 0;
2566}
2567
2568
2569extern void __skb_cb_too_small_for_tcp(int, int);
2570extern void tcpdiag_init(void);
2571
2572void __init tcp_init(void)
2573{
2574        struct sk_buff *skb = NULL;
2575        unsigned long goal;
2576        int order, i;
2577
2578        if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2579                __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2580                                           sizeof(skb->cb));
2581
2582        tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2583                                                   sizeof(struct open_request),
2584                                               0, SLAB_HWCACHE_ALIGN,
2585                                               NULL, NULL);
2586        if (!tcp_openreq_cachep)
2587                panic("tcp_init: Cannot alloc open_request cache.");
2588
2589        tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2590                                              sizeof(struct tcp_bind_bucket),
2591                                              0, SLAB_HWCACHE_ALIGN,
2592                                              NULL, NULL);
2593        if (!tcp_bucket_cachep)
2594                panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2595
2596        tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2597                                                sizeof(struct tcp_tw_bucket),
2598                                                0, SLAB_HWCACHE_ALIGN,
2599                                                NULL, NULL);
2600        if (!tcp_timewait_cachep)
2601                panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2602
2603        /* Size and allocate the main established and bind bucket
2604         * hash tables.
2605         *
2606         * The methodology is similar to that of the buffer cache.
2607         */
2608        if (num_physpages >= (128 * 1024))
2609                goal = num_physpages >> (21 - PAGE_SHIFT);
2610        else
2611                goal = num_physpages >> (23 - PAGE_SHIFT);
2612
2613        for (order = 0; (1UL << order) < goal; order++)
2614                ;
2615        do {
2616                tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2617                        sizeof(struct tcp_ehash_bucket);
2618                tcp_ehash_size >>= 1;
2619                while (tcp_ehash_size & (tcp_ehash_size - 1))
2620                        tcp_ehash_size--;
2621                tcp_ehash = (struct tcp_ehash_bucket *)
2622                        __get_free_pages(GFP_ATOMIC, order);
2623        } while (!tcp_ehash && --order > 0);
2624
2625        if (!tcp_ehash)
2626                panic("Failed to allocate TCP established hash table\n");
2627        for (i = 0; i < (tcp_ehash_size << 1); i++) {
2628                tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2629                INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2630        }
2631
2632        do {
2633                tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2634                        sizeof(struct tcp_bind_hashbucket);
2635                if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2636                        continue;
2637                tcp_bhash = (struct tcp_bind_hashbucket *)
2638                        __get_free_pages(GFP_ATOMIC, order);
2639        } while (!tcp_bhash && --order >= 0);
2640
2641        if (!tcp_bhash)
2642                panic("Failed to allocate TCP bind hash table\n");
2643        for (i = 0; i < tcp_bhash_size; i++) {
2644                tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2645                INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2646        }
2647
2648        /* Try to be a bit smarter and adjust defaults depending
2649         * on available memory.
2650         */
2651        if (order > 4) {
2652                sysctl_local_port_range[0] = 32768;
2653                sysctl_local_port_range[1] = 61000;
2654                sysctl_tcp_max_tw_buckets = 180000;
2655                sysctl_tcp_max_orphans = 4096 << (order - 4);
2656                sysctl_max_syn_backlog = 1024;
2657        } else if (order < 3) {
2658                sysctl_local_port_range[0] = 1024 * (3 - order);
2659                sysctl_tcp_max_tw_buckets >>= (3 - order);
2660                sysctl_tcp_max_orphans >>= (3 - order);
2661                sysctl_max_syn_backlog = 128;
2662        }
2663        tcp_port_rover = sysctl_local_port_range[0] - 1;
2664
2665        sysctl_tcp_mem[0] =  768 << order;
2666        sysctl_tcp_mem[1] = 1024 << order;
2667        sysctl_tcp_mem[2] = 1536 << order;
2668        if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
2669                sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
2670        if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
2671                sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
2672
2673        if (order < 3) {
2674                sysctl_tcp_wmem[2] = 64 * 1024;
2675                sysctl_tcp_rmem[0] = PAGE_SIZE;
2676                sysctl_tcp_rmem[1] = 43689;
2677                sysctl_tcp_rmem[2] = 2 * 43689;
2678        }
2679
2680        printk(KERN_INFO "TCP: Hash tables configured "
2681               "(established %d bind %d)\n",
2682               tcp_ehash_size << 1, tcp_bhash_size);
2683
2684        tcpdiag_init();
2685}
2686
/* Symbols exported from this file, grouped by kind. */

/* Tunables, counters and statistics */
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);
EXPORT_SYMBOL(tcp_sockets_allocated);
EXPORT_SYMBOL(tcp_statistics);

/* Slab caches */
EXPORT_SYMBOL(tcp_openreq_cachep);
EXPORT_SYMBOL(tcp_timewait_cachep);

/* Socket operations */
EXPORT_SYMBOL(tcp_accept);
EXPORT_SYMBOL(tcp_close);
EXPORT_SYMBOL(tcp_disconnect);
EXPORT_SYMBOL(tcp_getsockopt);
EXPORT_SYMBOL(tcp_ioctl);
EXPORT_SYMBOL(tcp_poll);
EXPORT_SYMBOL(tcp_recvmsg);
EXPORT_SYMBOL(tcp_sendmsg);
EXPORT_SYMBOL(tcp_sendpage);
EXPORT_SYMBOL(tcp_setsockopt);
EXPORT_SYMBOL(tcp_shutdown);

/* Helpers */
EXPORT_SYMBOL(__tcp_mem_reclaim);
EXPORT_SYMBOL(tcp_close_state);
EXPORT_SYMBOL(tcp_destroy_sock);
EXPORT_SYMBOL(tcp_read_sock);
EXPORT_SYMBOL(tcp_write_space);
2709
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.