linux/net/ipv4/tcp.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14 *              Florian La Roche, <flla@stud.uni-sb.de>
  15 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18 *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20 *              Jorge Cwik, <jorge@laser.satlink.net>
  21 *
  22 * Fixes:
  23 *              Alan Cox        :       Numerous verify_area() calls
  24 *              Alan Cox        :       Set the ACK bit on a reset
  25 *              Alan Cox        :       Stopped it crashing if it closed while
  26 *                                      sk->inuse=1 and was trying to connect
  27 *                                      (tcp_err()).
  28 *              Alan Cox        :       All icmp error handling was broken
  29 *                                      pointers passed were wrong and the
  30 *                                      socket was looked up backwards. Nobody
  31 *                                      tested any icmp error code obviously.
  32 *              Alan Cox        :       tcp_err() now handled properly. It
  33 *                                      wakes people on errors. poll
  34 *                                      behaves and the icmp error race
  35 *                                      has gone by moving it into sock.c
  36 *              Alan Cox        :       tcp_send_reset() fixed to work for
  37 *                                      everything not just packets for
  38 *                                      unknown sockets.
  39 *              Alan Cox        :       tcp option processing.
  40 *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41 *                                      syn rule wrong]
  42 *              Herp Rosmanith  :       More reset fixes
  43 *              Alan Cox        :       No longer acks invalid rst frames.
  44 *                                      Acking any kind of RST is right out.
  45 *              Alan Cox        :       Sets an ignore me flag on an rst
  46 *                                      receive otherwise odd bits of prattle
  47 *                                      escape still
  48 *              Alan Cox        :       Fixed another acking RST frame bug.
  49 *                                      Should stop LAN workplace lockups.
  50 *              Alan Cox        :       Some tidyups using the new skb list
  51 *                                      facilities
  52 *              Alan Cox        :       sk->keepopen now seems to work
  53 *              Alan Cox        :       Pulls options out correctly on accepts
  54 *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55 *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56 *                                      bit to skb ops.
  57 *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58 *                                      nasty.
  59 *              Alan Cox        :       Added some better commenting, as the
  60 *                                      tcp is hard to follow
  61 *              Alan Cox        :       Removed incorrect check for 20 * psh
  62 *      Michael O'Reilly        :       ack < copied bug fix.
  63 *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64 *              Alan Cox        :       FIN with no memory -> CRASH
  65 *              Alan Cox        :       Added socket option proto entries.
  66 *                                      Also added awareness of them to accept.
  67 *              Alan Cox        :       Added TCP options (SOL_TCP)
  68 *              Alan Cox        :       Switched wakeup calls to callbacks,
  69 *                                      so the kernel can layer network
  70 *                                      sockets.
  71 *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72 *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73 *              Alan Cox        :       RST frames sent on unsynchronised
  74 *                                      state ack error.
  75 *              Alan Cox        :       Put in missing check for SYN bit.
  76 *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77 *                                      window non shrink trick.
  78 *              Alan Cox        :       Added a couple of small NET2E timer
  79 *                                      fixes
  80 *              Charles Hedrick :       TCP fixes
  81 *              Toomas Tamm     :       TCP window fixes
  82 *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83 *              Charles Hedrick :       Rewrote most of it to actually work
  84 *              Linus           :       Rewrote tcp_read() and URG handling
  85 *                                      completely
  86 *              Gerhard Koerting:       Fixed some missing timer handling
  87 *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88 *              Gerhard Koerting:       PC/TCP workarounds
  89 *              Adam Caldwell   :       Assorted timer/timing errors
  90 *              Matthew Dillon  :       Fixed another RST bug
  91 *              Alan Cox        :       Move to kernel side addressing changes.
  92 *              Alan Cox        :       Beginning work on TCP fastpathing
  93 *                                      (not yet usable)
  94 *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95 *              Alan Cox        :       TCP fast path debugging
  96 *              Alan Cox        :       Window clamping
  97 *              Michael Riepe   :       Bug in tcp_check()
  98 *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99 *              Matt Dillon     :       Yet more small nasties removed from the
 100 *                                      TCP code (Be very nice to this man if
 101 *                                      tcp finally works 100%) 8)
 102 *              Alan Cox        :       BSD accept semantics.
 103 *              Alan Cox        :       Reset on closedown bug.
 104 *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105 *              Michael Pall    :       Handle poll() after URG properly in
 106 *                                      all cases.
 107 *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108 *                                      (multi URG PUSH broke rlogin).
 109 *              Michael Pall    :       Fix the multi URG PUSH problem in
 110 *                                      tcp_readable(), poll() after URG
 111 *                                      works now.
 112 *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113 *                                      BSD api.
 114 *              Alan Cox        :       Changed the semantics of sk->socket to
 115 *                                      fix a race and a signal problem with
 116 *                                      accept() and async I/O.
 117 *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118 *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119 *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120 *                                      clients/servers which listen in on
 121 *                                      fixed ports.
 122 *              Alan Cox        :       Cleaned the above up and shrank it to
 123 *                                      a sensible code size.
 124 *              Alan Cox        :       Self connect lockup fix.
 125 *              Alan Cox        :       No connect to multicast.
 126 *              Ross Biro       :       Close unaccepted children on master
 127 *                                      socket close.
 128 *              Alan Cox        :       Reset tracing code.
 129 *              Alan Cox        :       Spurious resets on shutdown.
 130 *              Alan Cox        :       Giant 15 minute/60 second timer error
 131 *              Alan Cox        :       Small whoops in polling before an
 132 *                                      accept.
 133 *              Alan Cox        :       Kept the state trace facility since
 134 *                                      it's handy for debugging.
 135 *              Alan Cox        :       More reset handler fixes.
 136 *              Alan Cox        :       Started rewriting the code based on
 137 *                                      the RFC's for other useful protocol
 138 *                                      references see: Comer, KA9Q NOS, and
 139 *                                      for a reference on the difference
 140 *                                      between specifications and how BSD
 141 *                                      works see the 4.4lite source.
 142 *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143 *                                      close.
 144 *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145 *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146 *              Alan Cox        :       Reimplemented timers as per the RFC
 147 *                                      and using multiple timers for sanity.
 148 *              Alan Cox        :       Small bug fixes, and a lot of new
 149 *                                      comments.
 150 *              Alan Cox        :       Fixed dual reader crash by locking
 151 *                                      the buffers (much like datagram.c)
 152 *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153 *                                      now gets fed up of retrying without
 154 *                                      (even a no space) answer.
 155 *              Alan Cox        :       Extracted closing code better
 156 *              Alan Cox        :       Fixed the closing state machine to
 157 *                                      resemble the RFC.
 158 *              Alan Cox        :       More 'per spec' fixes.
 159 *              Jorge Cwik      :       Even faster checksumming.
 160 *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161 *                                      only frames. At least one pc tcp stack
 162 *                                      generates them.
 163 *              Alan Cox        :       Cache last socket.
 164 *              Alan Cox        :       Per route irtt.
 165 *              Matt Day        :       poll()->select() match BSD precisely on error
 166 *              Alan Cox        :       New buffers
 167 *              Marc Tamsky     :       Various sk->prot->retransmits and
 168 *                                      sk->retransmits misupdating fixed.
 169 *                                      Fixed tcp_write_timeout: stuck close,
 170 *                                      and TCP syn retries gets used now.
 171 *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172 *                                      ack if state is TCP_CLOSED.
 173 *              Alan Cox        :       Look up device on a retransmit - routes may
 174 *                                      change. Doesn't yet cope with MSS shrink right
 175 *                                      but it's a start!
 176 *              Marc Tamsky     :       Closing in closing fixes.
 177 *              Mike Shaver     :       RFC1122 verifications.
 178 *              Alan Cox        :       rcv_saddr errors.
 179 *              Alan Cox        :       Block double connect().
 180 *              Alan Cox        :       Small hooks for enSKIP.
 181 *              Alexey Kuznetsov:       Path MTU discovery.
 182 *              Alan Cox        :       Support soft errors.
 183 *              Alan Cox        :       Fix MTU discovery pathological case
 184 *                                      when the remote claims no mtu!
 185 *              Marc Tamsky     :       TCP_CLOSE fix.
 186 *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187 *                                      window but wrong (fixes NT lpd problems)
 188 *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189 *              Joerg Reuter    :       No modification of locked buffers in
 190 *                                      tcp_do_retransmit()
 191 *              Eric Schenk     :       Changed receiver side silly window
 192 *                                      avoidance algorithm to BSD style
 193 *                                      algorithm. This doubles throughput
 194 *                                      against machines running Solaris,
 195 *                                      and seems to result in general
 196 *                                      improvement.
 197 *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198 *      Willy Konynenberg       :       Transparent proxying support.
 199 *      Mike McLagan            :       Routing by source
 200 *              Keith Owens     :       Do proper merging with partial SKB's in
 201 *                                      tcp_do_sendmsg to avoid burstiness.
 202 *              Eric Schenk     :       Fix fast close down bug with
 203 *                                      shutdown() followed by close().
 204 *              Andi Kleen      :       Make poll agree with SIGIO
 205 *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 206 *                                      lingertime == 0 (RFC 793 ABORT Call)
 207 *      Hirokazu Takahashi      :       Use copy_from_user() instead of
 208 *                                      csum_and_copy_from_user() if possible.
 209 *
 210 *              This program is free software; you can redistribute it and/or
 211 *              modify it under the terms of the GNU General Public License
 212 *              as published by the Free Software Foundation; either version
 213 *              2 of the License, or(at your option) any later version.
 214 *
 215 * Description of States:
 216 *
 217 *      TCP_SYN_SENT            sent a connection request, waiting for ack
 218 *
 219 *      TCP_SYN_RECV            received a connection request, sent ack,
 220 *                              waiting for final ack in three-way handshake.
 221 *
 222 *      TCP_ESTABLISHED         connection established
 223 *
 224 *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 225 *                              transmission of remaining buffered data
 226 *
 227 *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 228 *                              to shutdown
 229 *
 230 *      TCP_CLOSING             both sides have shutdown but we still have
 231 *                              data we have to finish sending
 232 *
 233 *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 234 *                              closed, can only be entered from FIN_WAIT2
 235 *                              or CLOSING.  Required because the other end
 236 *                              may not have gotten our last ACK causing it
 237 *                              to retransmit the data packet (which we ignore)
 238 *
 239 *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 240 *                              us to finish writing our data and to shutdown
 241 *                              (we have to close() to move on to LAST_ACK)
 242 *
 243 *      TCP_LAST_ACK            our side has shutdown after remote has
 244 *                              shutdown.  There may still be data in our
 245 *                              buffer that we have to finish sending
 246 *
 247 *      TCP_CLOSE               socket is finished
 248 */
 249
 250#include <linux/kernel.h>
 251#include <linux/module.h>
 252#include <linux/types.h>
 253#include <linux/fcntl.h>
 254#include <linux/poll.h>
 255#include <linux/init.h>
 256#include <linux/fs.h>
 257#include <linux/skbuff.h>
 258#include <linux/splice.h>
 259#include <linux/net.h>
 260#include <linux/socket.h>
 261#include <linux/random.h>
 262#include <linux/bootmem.h>
 263#include <linux/cache.h>
 264#include <linux/err.h>
 265#include <linux/crypto.h>
 266
 267#include <net/icmp.h>
 268#include <net/tcp.h>
 269#include <net/xfrm.h>
 270#include <net/ip.h>
 271#include <net/netdma.h>
 272#include <net/sock.h>
 273
 274#include <asm/uaccess.h>
 275#include <asm/ioctls.h>
 276
 277int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 278
 279DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
 280
 281atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 282
 283EXPORT_SYMBOL_GPL(tcp_orphan_count);
 284
 285int sysctl_tcp_mem[3] __read_mostly;
 286int sysctl_tcp_wmem[3] __read_mostly;
 287int sysctl_tcp_rmem[3] __read_mostly;
 288
 289EXPORT_SYMBOL(sysctl_tcp_mem);
 290EXPORT_SYMBOL(sysctl_tcp_rmem);
 291EXPORT_SYMBOL(sysctl_tcp_wmem);
 292
 293atomic_t tcp_memory_allocated;  /* Current allocated memory. */
 294atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
 295
 296EXPORT_SYMBOL(tcp_memory_allocated);
 297EXPORT_SYMBOL(tcp_sockets_allocated);
 298
 299/*
 300 * TCP splice context
 301 */
 302struct tcp_splice_state {
 303        struct pipe_inode_info *pipe;
 304        size_t len;
 305        unsigned int flags;
 306};
 307
 308/*
 309 * Pressure flag: try to collapse.
 310 * Technical note: it is used by multiple contexts non atomically.
 311 * All the __sk_mem_schedule() is of this nature: accounting
 312 * is strict, actions are advisory and have some latency.
 313 */
 314int tcp_memory_pressure __read_mostly;
 315
 316EXPORT_SYMBOL(tcp_memory_pressure);
 317
 318void tcp_enter_memory_pressure(void)
 319{
 320        if (!tcp_memory_pressure) {
 321                NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
 322                tcp_memory_pressure = 1;
 323        }
 324}
 325
 326EXPORT_SYMBOL(tcp_enter_memory_pressure);
 327
 328/*
 329 *      Wait for a TCP event.
 330 *
 331 *      Note that we don't need to lock the socket, as the upper poll layers
 332 *      take care of normal races (between the test and the event) and we don't
 333 *      go look at any of the socket buffers directly.
 334 */
 335unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 336{
 337        unsigned int mask;
 338        struct sock *sk = sock->sk;
 339        struct tcp_sock *tp = tcp_sk(sk);
 340
 341        poll_wait(file, sk->sk_sleep, wait);
 342        if (sk->sk_state == TCP_LISTEN)
 343                return inet_csk_listen_poll(sk);
 344
 345        /* Socket is not locked. We are protected from async events
 346           by poll logic and correct handling of state changes
 347           made by another threads is impossible in any case.
 348         */
 349
 350        mask = 0;
 351        if (sk->sk_err)
 352                mask = POLLERR;
 353
 354        /*
 355         * POLLHUP is certainly not done right. But poll() doesn't
 356         * have a notion of HUP in just one direction, and for a
 357         * socket the read side is more interesting.
 358         *
 359         * Some poll() documentation says that POLLHUP is incompatible
 360         * with the POLLOUT/POLLWR flags, so somebody should check this
 361         * all. But careful, it tends to be safer to return too many
 362         * bits than too few, and you can easily break real applications
 363         * if you don't tell them that something has hung up!
 364         *
 365         * Check-me.
 366         *
 367         * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 368         * our fs/select.c). It means that after we received EOF,
 369         * poll always returns immediately, making impossible poll() on write()
 370         * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 371         * if and only if shutdown has been made in both directions.
 372         * Actually, it is interesting to look how Solaris and DUX
 373         * solve this dilemma. I would prefer, if PULLHUP were maskable,
 374         * then we could set it on SND_SHUTDOWN. BTW examples given
 375         * in Stevens' books assume exactly this behaviour, it explains
 376         * why PULLHUP is incompatible with POLLOUT.    --ANK
 377         *
 378         * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 379         * blocking on fresh not-connected or disconnected socket. --ANK
 380         */
 381        if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
 382                mask |= POLLHUP;
 383        if (sk->sk_shutdown & RCV_SHUTDOWN)
 384                mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 385
 386        /* Connected? */
 387        if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 388                /* Potential race condition. If read of tp below will
 389                 * escape above sk->sk_state, we can be illegally awaken
 390                 * in SYN_* states. */
 391                if ((tp->rcv_nxt != tp->copied_seq) &&
 392                    (tp->urg_seq != tp->copied_seq ||
 393                     tp->rcv_nxt != tp->copied_seq + 1 ||
 394                     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
 395                        mask |= POLLIN | POLLRDNORM;
 396
 397                if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 398                        if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 399                                mask |= POLLOUT | POLLWRNORM;
 400                        } else {  /* send SIGIO later */
 401                                set_bit(SOCK_ASYNC_NOSPACE,
 402                                        &sk->sk_socket->flags);
 403                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 404
 405                                /* Race breaker. If space is freed after
 406                                 * wspace test but before the flags are set,
 407                                 * IO signal will be lost.
 408                                 */
 409                                if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
 410                                        mask |= POLLOUT | POLLWRNORM;
 411                        }
 412                }
 413
 414                if (tp->urg_data & TCP_URG_VALID)
 415                        mask |= POLLPRI;
 416        }
 417        return mask;
 418}
 419
 420int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 421{
 422        struct tcp_sock *tp = tcp_sk(sk);
 423        int answ;
 424
 425        switch (cmd) {
 426        case SIOCINQ:
 427                if (sk->sk_state == TCP_LISTEN)
 428                        return -EINVAL;
 429
 430                lock_sock(sk);
 431                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 432                        answ = 0;
 433                else if (sock_flag(sk, SOCK_URGINLINE) ||
 434                         !tp->urg_data ||
 435                         before(tp->urg_seq, tp->copied_seq) ||
 436                         !before(tp->urg_seq, tp->rcv_nxt)) {
 437                        answ = tp->rcv_nxt - tp->copied_seq;
 438
 439                        /* Subtract 1, if FIN is in queue. */
 440                        if (answ && !skb_queue_empty(&sk->sk_receive_queue))
 441                                answ -=
 442                       tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin;
 443                } else
 444                        answ = tp->urg_seq - tp->copied_seq;
 445                release_sock(sk);
 446                break;
 447        case SIOCATMARK:
 448                answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 449                break;
 450        case SIOCOUTQ:
 451                if (sk->sk_state == TCP_LISTEN)
 452                        return -EINVAL;
 453
 454                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 455                        answ = 0;
 456                else
 457                        answ = tp->write_seq - tp->snd_una;
 458                break;
 459        default:
 460                return -ENOIOCTLCMD;
 461        }
 462
 463        return put_user(answ, (int __user *)arg);
 464}
 465
 466static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 467{
 468        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 469        tp->pushed_seq = tp->write_seq;
 470}
 471
 472static inline int forced_push(struct tcp_sock *tp)
 473{
 474        return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 475}
 476
 477static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 478{
 479        struct tcp_sock *tp = tcp_sk(sk);
 480        struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 481
 482        skb->csum    = 0;
 483        tcb->seq     = tcb->end_seq = tp->write_seq;
 484        tcb->flags   = TCPCB_FLAG_ACK;
 485        tcb->sacked  = 0;
 486        skb_header_release(skb);
 487        tcp_add_write_queue_tail(sk, skb);
 488        sk->sk_wmem_queued += skb->truesize;
 489        sk_mem_charge(sk, skb->truesize);
 490        if (tp->nonagle & TCP_NAGLE_PUSH)
 491                tp->nonagle &= ~TCP_NAGLE_PUSH;
 492}
 493
 494static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
 495                                struct sk_buff *skb)
 496{
 497        if (flags & MSG_OOB) {
 498                tp->urg_mode = 1;
 499                tp->snd_up = tp->write_seq;
 500        }
 501}
 502
 503static inline void tcp_push(struct sock *sk, int flags, int mss_now,
 504                            int nonagle)
 505{
 506        struct tcp_sock *tp = tcp_sk(sk);
 507
 508        if (tcp_send_head(sk)) {
 509                struct sk_buff *skb = tcp_write_queue_tail(sk);
 510                if (!(flags & MSG_MORE) || forced_push(tp))
 511                        tcp_mark_push(tp, skb);
 512                tcp_mark_urg(tp, flags, skb);
 513                __tcp_push_pending_frames(sk, mss_now,
 514                                          (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 515        }
 516}
 517
 518static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
 519                                unsigned int offset, size_t len)
 520{
 521        struct tcp_splice_state *tss = rd_desc->arg.data;
 522
 523        return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags);
 524}
 525
 526static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
 527{
 528        /* Store TCP splice context information in read_descriptor_t. */
 529        read_descriptor_t rd_desc = {
 530                .arg.data = tss,
 531        };
 532
 533        return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
 534}
 535
 536/**
 537 *  tcp_splice_read - splice data from TCP socket to a pipe
 538 * @sock:       socket to splice from
 539 * @ppos:       position (not valid)
 540 * @pipe:       pipe to splice to
 541 * @len:        number of bytes to splice
 542 * @flags:      splice modifier flags
 543 *
 544 * Description:
 545 *    Will read pages from given socket and fill them into a pipe.
 546 *
 547 **/
 548ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 549                        struct pipe_inode_info *pipe, size_t len,
 550                        unsigned int flags)
 551{
 552        struct sock *sk = sock->sk;
 553        struct tcp_splice_state tss = {
 554                .pipe = pipe,
 555                .len = len,
 556                .flags = flags,
 557        };
 558        long timeo;
 559        ssize_t spliced;
 560        int ret;
 561
 562        /*
 563         * We can't seek on a socket input
 564         */
 565        if (unlikely(*ppos))
 566                return -ESPIPE;
 567
 568        ret = spliced = 0;
 569
 570        lock_sock(sk);
 571
 572        timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
 573        while (tss.len) {
 574                ret = __tcp_splice_read(sk, &tss);
 575                if (ret < 0)
 576                        break;
 577                else if (!ret) {
 578                        if (spliced)
 579                                break;
 580                        if (flags & SPLICE_F_NONBLOCK) {
 581                                ret = -EAGAIN;
 582                                break;
 583                        }
 584                        if (sock_flag(sk, SOCK_DONE))
 585                                break;
 586                        if (sk->sk_err) {
 587                                ret = sock_error(sk);
 588                                break;
 589                        }
 590                        if (sk->sk_shutdown & RCV_SHUTDOWN)
 591                                break;
 592                        if (sk->sk_state == TCP_CLOSE) {
 593                                /*
 594                                 * This occurs when user tries to read
 595                                 * from never connected socket.
 596                                 */
 597                                if (!sock_flag(sk, SOCK_DONE))
 598                                        ret = -ENOTCONN;
 599                                break;
 600                        }
 601                        if (!timeo) {
 602                                ret = -EAGAIN;
 603                                break;
 604                        }
 605                        sk_wait_data(sk, &timeo);
 606                        if (signal_pending(current)) {
 607                                ret = sock_intr_errno(timeo);
 608                                break;
 609                        }
 610                        continue;
 611                }
 612                tss.len -= ret;
 613                spliced += ret;
 614
 615                release_sock(sk);
 616                lock_sock(sk);
 617
 618                if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
 619                    (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo ||
 620                    signal_pending(current))
 621                        break;
 622        }
 623
 624        release_sock(sk);
 625
 626        if (spliced)
 627                return spliced;
 628
 629        return ret;
 630}
 631
 632struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
 633{
 634        struct sk_buff *skb;
 635
 636        /* The TCP header must be at least 32-bit aligned.  */
 637        size = ALIGN(size, 4);
 638
 639        skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
 640        if (skb) {
 641                if (sk_wmem_schedule(sk, skb->truesize)) {
 642                        /*
 643                         * Make sure that we have exactly size bytes
 644                         * available to the caller, no more, no less.
 645                         */
 646                        skb_reserve(skb, skb_tailroom(skb) - size);
 647                        return skb;
 648                }
 649                __kfree_skb(skb);
 650        } else {
 651                sk->sk_prot->enter_memory_pressure();
 652                sk_stream_moderate_sndbuf(sk);
 653        }
 654        return NULL;
 655}
 656
 /*
  * Queue page data on the socket's write queue without copying it: each
  * chunk is attached to the tail skb as a page fragment, either by growing
  * the last fragment or by taking a page reference (get_page) and filling
  * a new fragment descriptor.
  *
  * sk:      socket to send on; tcp_sendpage() calls this between
  *          lock_sock(sk) and release_sock(sk).
  * pages:   page array; the page for the current chunk is
  *          pages[poffset / PAGE_SIZE].
  * poffset: byte offset of the data, counted from the start of pages[0].
  * psize:   number of bytes to queue.
  * flags:   MSG_* flags; MSG_DONTWAIT and MSG_OOB are tested here and the
  *          value is also handed to tcp_push() and sk_stream_error().
  *
  * Returns the number of bytes queued.  If an error occurs after some
  * bytes were queued, that byte count is still returned; if nothing was
  * queued, the result of sk_stream_error(sk, flags, err) is returned.
  */
 657static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 658                         size_t psize, int flags)
 659{
 660        struct tcp_sock *tp = tcp_sk(sk);
 661        int mss_now, size_goal;
 662        int err;
 663        ssize_t copied;
 664        long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 665
 666        /* Wait for a connection to finish. */
 667        if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 668                if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 669                        goto out_err;
 670
 671        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 672
 673        mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 674        size_goal = tp->xmit_size_goal;
 675        copied = 0;
 676
            /* A pending socket error or a send-side shutdown fails with -EPIPE. */
 677        err = -EPIPE;
 678        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 679                goto do_error;
 680
            /* Each pass queues at most the rest of the current page. */
 681        while (psize > 0) {
 682                struct sk_buff *skb = tcp_write_queue_tail(sk);
 683                struct page *page = pages[poffset / PAGE_SIZE];
 684                int copy, i, can_coalesce;
 685                int offset = poffset % PAGE_SIZE;
 686                int size = min_t(size_t, psize, PAGE_SIZE - offset);
 687
                    /*
                     * Start a new skb when tcp_send_head() returns NULL or
                     * when the tail skb has no room left below size_goal.
                     * Otherwise copy holds the room left in the tail skb.
                     */
 688                if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
                    /* Also entered by goto when the tail skb has no free fragment slot. */
 689new_segment:
 690                        if (!sk_stream_memory_free(sk))
 691                                goto wait_for_sndbuf;
 692
                            /* Size 0: the payload lives only in page fragments. */
 693                        skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
 694                        if (!skb)
 695                                goto wait_for_memory;
 696
 697                        skb_entail(sk, skb);
 698                        copy = size_goal;
 699                }
 700
 701                if (copy > size)
 702                        copy = size;
 703
 704                i = skb_shinfo(skb)->nr_frags;
 705                can_coalesce = skb_can_coalesce(skb, i, page, offset);
 706                if (!can_coalesce && i >= MAX_SKB_FRAGS) {
 707                        tcp_mark_push(tp, skb);
 708                        goto new_segment;
 709                }
 710                if (!sk_wmem_schedule(sk, copy))
 711                        goto wait_for_memory;
 712
 713                if (can_coalesce) {
                            /* Data continues the last fragment: just extend it. */
 714                        skb_shinfo(skb)->frags[i - 1].size += copy;
 715                } else {
                            /* New fragment: the skb takes its own reference on the page. */
 716                        get_page(page);
 717                        skb_fill_page_desc(skb, i, page, offset, copy);
 718                }
 719
                    /* Account the added bytes in the skb, the socket and the sequence space. */
 720                skb->len += copy;
 721                skb->data_len += copy;
 722                skb->truesize += copy;
 723                sk->sk_wmem_queued += copy;
 724                sk_mem_charge(sk, copy);
 725                skb->ip_summed = CHECKSUM_PARTIAL;
 726                tp->write_seq += copy;
 727                TCP_SKB_CB(skb)->end_seq += copy;
 728                skb_shinfo(skb)->gso_segs = 0;
 729
                    /* Only the first chunk of this call clears PSH on its skb. */
 730                if (!copied)
 731                        TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 732
 733                copied += copy;
 734                poffset += copy;
 735                if (!(psize -= copy))
 736                        goto out;
 737
                    /* Keep filling this skb until it reaches size_goal (always for MSG_OOB). */
 738                if (skb->len < size_goal || (flags & MSG_OOB))
 739                        continue;
 740
 741                if (forced_push(tp)) {
 742                        tcp_mark_push(tp, skb);
 743                        __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
 744                } else if (skb == tcp_send_head(sk))
 745                        tcp_push_one(sk, mss_now);
 746                continue;
 747
                    /*
                     * The two labels below are reached only by goto.  After a
                     * successful wait, control falls off the end of the loop
                     * body and the while condition is tested again.
                     */
 748wait_for_sndbuf:
 749                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 750wait_for_memory:
                    /* Push what is already queued (MSG_MORE masked off) before waiting. */
 751                if (copied)
 752                        tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 753
 754                if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 755                        goto do_error;
 756
                    /* Re-read both values after the wait. */
 757                mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 758                size_goal = tp->xmit_size_goal;
 759        }
 760
 761out:
 762        if (copied)
 763                tcp_push(sk, flags, mss_now, tp->nonagle);
 764        return copied;
 765
 766do_error:
            /* A partial transfer is reported as success with its byte count. */
 767        if (copied)
 768                goto out;
 769out_err:
 770        return sk_stream_error(sk, flags, err);
 771}
 772
 773ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 774                     size_t size, int flags)
 775{
 776        ssize_t res;
 777        struct sock *sk = sock->sk;
 778
 779        if (!(sk->sk_route_caps & NETIF_F_SG) ||
 780            !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
 781                return sock_no_sendpage(sock, page, offset, size, flags);
 782
 783        lock_sock(sk);
 784        TCP_CHECK_TIMER(sk);
 785        res = do_tcp_sendpages(sk, &page, offset, size, flags);
 786        TCP_CHECK_TIMER(sk);
 787        release_sock(sk);
 788        return res;
 789}
 790
 791#define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
 792#define TCP_OFF(sk)     (sk->sk_sndmsg_off)
 793
 794static inline int select_size(struct sock *sk)
 795{
 796        struct tcp_sock *tp = tcp_sk(sk);
 797        int tmp = tp->mss_cache;
 798
 799        if (sk->sk_route_caps & NETIF_F_SG) {
 800                if (sk_can_gso(sk))
 801                        tmp = 0;
 802                else {
 803                        int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
 804
 805                        if (tmp >= pgbreak &&
 806                            tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
 807                                tmp = pgbreak;
 808                }
 809        }
 810
 811        return tmp;
 812}
 813
 814int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 815                size_t size)
 816{
 817        struct sock *sk = sock->sk;
 818        struct iovec *iov;
 819        struct tcp_sock *tp = tcp_sk(sk);
 820        struct sk_buff *skb;
 821        int iovlen, flags;
 822        int mss_now, size_goal;
 823        int err, copied;
 824        long timeo;
 825
 826        lock_sock(sk);
 827        TCP_CHECK_TIMER(sk);
 828
 829        flags = msg->msg_flags;
 830        timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 831
 832        /* Wait for a connection to finish. */
 833        if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 834                if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 835                        goto out_err;
 836
 837        /* This should be in poll */
 838        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 839
 840        mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 841        size_goal = tp->xmit_size_goal;
 842
 843        /* Ok commence sending. */
 844        iovlen = msg->msg_iovlen;
 845        iov = msg->msg_iov;
 846        copied = 0;
 847
 848        err = -EPIPE;
 849        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 850                goto do_error;
 851
 852        while (--iovlen >= 0) {
 853                int seglen = iov->iov_len;
 854                unsigned char __user *from = iov->iov_base;
 855
 856                iov++;
 857
 858                while (seglen > 0) {
 859                        int copy;
 860
 861                        skb = tcp_write_queue_tail(sk);
 862
 863                        if (!tcp_send_head(sk) ||
 864                            (copy = size_goal - skb->len) <= 0) {
 865
 866new_segment:
 867                                /* Allocate new segment. If the interface is SG,
 868                                 * allocate skb fitting to single page.
 869                                 */
 870                                if (!sk_stream_memory_free(sk))
 871                                        goto wait_for_sndbuf;
 872
 873                                skb = sk_stream_alloc_skb(sk, select_size(sk),
 874                                                sk->sk_allocation);
 875                                if (!skb)
 876                                        goto wait_for_memory;
 877
 878                                /*
 879                                 * Check whether we can use HW checksum.
 880                                 */
 881                                if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
 882                                        skb->ip_summed = CHECKSUM_PARTIAL;
 883
 884                                skb_entail(sk, skb);
 885                                copy = size_goal;
 886                        }
 887
 888                        /* Try to append data to the end of skb. */
 889                        if (copy > seglen)
 890                                copy = seglen;
 891
 892                        /* Where to copy to? */
 893                        if (skb_tailroom(skb) > 0) {
 894                                /* We have some space in skb head. Superb! */
 895                                if (copy > skb_tailroom(skb))
 896                                        copy = skb_tailroom(skb);
 897                                if ((err = skb_add_data(skb, from, copy)) != 0)
 898                                        goto do_fault;
 899                        } else {
 900                                int merge = 0;
 901                                int i = skb_shinfo(skb)->nr_frags;
 902                                struct page *page = TCP_PAGE(sk);
 903                                int off = TCP_OFF(sk);
 904
 905                                if (skb_can_coalesce(skb, i, page, off) &&
 906                                    off != PAGE_SIZE) {
 907                                        /* We can extend the last page
 908                                         * fragment. */
 909                                        merge = 1;
 910                                } else if (i == MAX_SKB_FRAGS ||
 911                                           (!i &&
 912                                           !(sk->sk_route_caps & NETIF_F_SG))) {
 913                                        /* Need to add new fragment and cannot
 914                                         * do this because interface is non-SG,
 915                                         * or because all the page slots are
 916                                         * busy. */
 917                                        tcp_mark_push(tp, skb);
 918                                        goto new_segment;
 919                                } else if (page) {
 920                                        if (off == PAGE_SIZE) {
 921                                                put_page(page);
 922                                                TCP_PAGE(sk) = page = NULL;
 923                                                off = 0;
 924                                        }
 925                                } else
 926                                        off = 0;
 927
 928                                if (copy > PAGE_SIZE - off)
 929                                        copy = PAGE_SIZE - off;
 930
 931                                if (!sk_wmem_schedule(sk, copy))
 932                                        goto wait_for_memory;
 933
 934                                if (!page) {
 935                                        /* Allocate new cache page. */
 936                                        if (!(page = sk_stream_alloc_page(sk)))
 937                                                goto wait_for_memory;
 938                                }
 939
 940                                /* Time to copy data. We are close to
 941                                 * the end! */
 942                                err = skb_copy_to_page(sk, from, skb, page,
 943                                                       off, copy);
 944                                if (err) {
 945                                        /* If this page was new, give it to the
 946                                         * socket so it does not get leaked.
 947                                         */
 948                                        if (!TCP_PAGE(sk)) {
 949                                                TCP_PAGE(sk) = page;
 950                                                TCP_OFF(sk) = 0;
 951                                        }
 952                                        goto do_error;
 953                                }
 954
 955                                /* Update the skb. */
 956                                if (merge) {
 957                                        skb_shinfo(skb)->frags[i - 1].size +=
 958                                                                        copy;
 959                                } else {
 960                                        skb_fill_page_desc(skb, i, page, off, copy);
 961                                        if (TCP_PAGE(sk)) {
 962                                                get_page(page);
 963                                        } else if (off + copy < PAGE_SIZE) {
 964                                                get_page(page);
 965                                                TCP_PAGE(sk) = page;
 966                                        }
 967                                }
 968
 969                                TCP_OFF(sk) = off + copy;
 970                        }
 971
 972                        if (!copied)
 973                                TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 974
 975                        tp->write_seq += copy;
 976                        TCP_SKB_CB(skb)->end_seq += copy;
 977                        skb_shinfo(skb)->gso_segs = 0;
 978
 979                        from += copy;
 980                        copied += copy;
 981                        if ((seglen -= copy) == 0 && iovlen == 0)
 982                                goto out;
 983
 984                        if (skb->len < size_goal || (flags & MSG_OOB))
 985                                continue;
 986
 987                        if (forced_push(tp)) {
 988                                tcp_mark_push(tp, skb);
 989                                __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
 990                        } else if (skb == tcp_send_head(sk))
 991                                tcp_push_one(sk, mss_now);
 992                        continue;
 993
 994wait_for_sndbuf:
 995                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 996wait_for_memory:
 997                        if (copied)
 998                                tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 999
1000                        if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1001                                goto do_error;
1002
1003                        mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1004                        size_goal = tp->xmit_size_goal;
1005                }
1006        }
1007
1008out:
1009        if (copied)
1010                tcp_push(sk, flags, mss_now, tp->nonagle);
1011        TCP_CHECK_TIMER(sk);
1012        release_sock(sk);
1013        return copied;
1014
1015do_fault:
1016        if (!skb->len) {
1017                tcp_unlink_write_queue(skb, sk);
1018                /* It is the one place in all of TCP, except connection
1019                 * reset, where we can be unlinking the send_head.
1020                 */
1021                tcp_check_send_head(sk, skb);
1022                sk_wmem_free_skb(sk, skb);
1023        }
1024
1025do_error:
1026        if (copied)
1027                goto out;
1028out_err:
1029        err = sk_stream_error(sk, flags, err);
1030        TCP_CHECK_TIMER(sk);
1031        release_sock(sk);
1032        return err;
1033}
1034
1035/*
1036 *      Handle reading urgent data. BSD has very simple semantics for
1037 *      this, no blocking and very strange errors 8)
1038 */
1039
1040static int tcp_recv_urg(struct sock *sk, long timeo,
1041                        struct msghdr *msg, int len, int flags,
1042                        int *addr_len)
1043{
1044        struct tcp_sock *tp = tcp_sk(sk);
1045
1046        /* No URG data to read. */
1047        if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1048            tp->urg_data == TCP_URG_READ)
1049                return -EINVAL; /* Yes this is right ! */
1050
1051        if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1052                return -ENOTCONN;
1053
1054        if (tp->urg_data & TCP_URG_VALID) {
1055                int err = 0;
1056                char c = tp->urg_data;
1057
1058                if (!(flags & MSG_PEEK))
1059                        tp->urg_data = TCP_URG_READ;
1060
1061                /* Read urgent data. */
1062                msg->msg_flags |= MSG_OOB;
1063
1064                if (len > 0) {
1065                        if (!(flags & MSG_TRUNC))
1066                                err = memcpy_toiovec(msg->msg_iov, &c, 1);
1067                        len = 1;
1068                } else
1069                        msg->msg_flags |= MSG_TRUNC;
1070
1071                return err ? -EFAULT : len;
1072        }
1073
1074        if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1075                return 0;
1076
1077        /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1078         * the available implementations agree in this case:
1079         * this call should never block, independent of the
1080         * blocking state of the socket.
1081         * Mike <pall@rz.uni-karlsruhe.de>
1082         */
1083        return -EAGAIN;
1084}
1085
1086/* Clean up the receive buffer for full frames taken by the user,
1087 * then send an ACK if necessary.  COPIED is the number of bytes
1088 * tcp_recvmsg has given to the user so far, it speeds up the
1089 * calculation of whether or not we must ACK for the sake of
1090 * a window update.
1091 */
1092void tcp_cleanup_rbuf(struct sock *sk, int copied)
1093{
1094        struct tcp_sock *tp = tcp_sk(sk);
1095        int time_to_ack = 0;
1096
1097#if TCP_DEBUG
1098        struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1099
1100        BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1101#endif
1102
1103        if (inet_csk_ack_scheduled(sk)) {
1104                const struct inet_connection_sock *icsk = inet_csk(sk);
1105                   /* Delayed ACKs frequently hit locked sockets during bulk
1106                    * receive. */
1107                if (icsk->icsk_ack.blocked ||
1108                    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1109                    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1110                    /*
1111                     * If this read emptied read buffer, we send ACK, if
1112                     * connection is not bidirectional, user drained
1113                     * receive buffer and there was a small segment
1114                     * in queue.
1115                     */
1116                    (copied > 0 &&
1117                     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1118                      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1119                       !icsk->icsk_ack.pingpong)) &&
1120                      !atomic_read(&sk->sk_rmem_alloc)))
1121                        time_to_ack = 1;
1122        }
1123
1124        /* We send an ACK if we can now advertise a non-zero window
1125         * which has been raised "significantly".
1126         *
1127         * Even if window raised up to infinity, do not send window open ACK
1128         * in states, where we will not receive more. It is useless.
1129         */
1130        if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1131                __u32 rcv_window_now = tcp_receive_window(tp);
1132
1133                /* Optimize, __tcp_select_window() is not cheap. */
1134                if (2*rcv_window_now <= tp->window_clamp) {
1135                        __u32 new_window = __tcp_select_window(sk);
1136
1137                        /* Send ACK now, if this read freed lots of space
1138                         * in our buffer. Certainly, new_window is new window.
1139                         * We can advertise it now, if it is not less than current one.
1140                         * "Lots" means "at least twice" here.
1141                         */
1142                        if (new_window && new_window >= 2 * rcv_window_now)
1143                                time_to_ack = 1;
1144                }
1145        }
1146        if (time_to_ack)
1147                tcp_send_ack(sk);
1148}
1149
1150static void tcp_prequeue_process(struct sock *sk)
1151{
1152        struct sk_buff *skb;
1153        struct tcp_sock *tp = tcp_sk(sk);
1154
1155        NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1156
1157        /* RX process wants to run with disabled BHs, though it is not
1158         * necessary */
1159        local_bh_disable();
1160        while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1161                sk->sk_backlog_rcv(sk, skb);
1162        local_bh_enable();
1163
1164        /* Clear memory counter. */
1165        tp->ucopy.memory = 0;
1166}
1167
1168static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1169{
1170        struct sk_buff *skb;
1171        u32 offset;
1172
1173        skb_queue_walk(&sk->sk_receive_queue, skb) {
1174                offset = seq - TCP_SKB_CB(skb)->seq;
1175                if (tcp_hdr(skb)->syn)
1176                        offset--;
1177                if (offset < skb->len || tcp_hdr(skb)->fin) {
1178                        *off = offset;
1179                        return skb;
1180                }
1181        }
1182        return NULL;
1183}
1184
1185/*
1186 * This routine provides an alternative to tcp_recvmsg() for routines
1187 * that would like to handle copying from skbuffs directly in 'sendfile'
1188 * fashion.
1189 * Note:
1190 *      - It is assumed that the socket was locked by the caller.
1191 *      - The routine does not block.
1192 *      - At present, there is no support for reading OOB data
1193 *        or for 'peeking' the socket using this routine
1194 *        (although both would be easy to implement).
1195 */
1196int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1197                  sk_read_actor_t recv_actor)
1198{
1199        struct sk_buff *skb;
1200        struct tcp_sock *tp = tcp_sk(sk);
1201        u32 seq = tp->copied_seq;
1202        u32 offset;
1203        int copied = 0;
1204
1205        if (sk->sk_state == TCP_LISTEN)
1206                return -ENOTCONN;
1207        while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1208                if (offset < skb->len) {
1209                        size_t used, len;
1210
1211                        len = skb->len - offset;
1212                        /* Stop reading if we hit a patch of urgent data */
1213                        if (tp->urg_data) {
1214                                u32 urg_offset = tp->urg_seq - seq;
1215                                if (urg_offset < len)
1216                                        len = urg_offset;
1217                                if (!len)
1218                                        break;
1219                        }
1220                        used = recv_actor(desc, skb, offset, len);
1221                        if (used < 0) {
1222                                if (!copied)
1223                                        copied = used;
1224                                break;
1225                        } else if (used <= len) {
1226                                seq += used;
1227                                copied += used;
1228                                offset += used;
1229                        }
1230                        if (offset != skb->len)
1231                                break;
1232                }
1233                if (tcp_hdr(skb)->fin) {
1234                        sk_eat_skb(sk, skb, 0);
1235                        ++seq;
1236                        break;
1237                }
1238                sk_eat_skb(sk, skb, 0);
1239                if (!desc->count)
1240                        break;
1241        }
1242        tp->copied_seq = seq;
1243
1244        tcp_rcv_space_adjust(sk);
1245
1246        /* Clean up data we have read: This will do ACK frames. */
1247        if (copied > 0)
1248                tcp_cleanup_rbuf(sk, copied);
1249        return copied;
1250}
1251
1252/*
1253 *      This routine copies from a sock struct into the user buffer.
1254 *
1255 *      Technical note: in 2.3 we work on _locked_ socket, so that
1256 *      tricks with *seq access order and skb->users are not required.
1257 *      Probably, code can be easily improved even more.
1258 */
1259
1260int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1261                size_t len, int nonblock, int flags, int *addr_len)
1262{
1263        struct tcp_sock *tp = tcp_sk(sk);
1264        int copied = 0;
1265        u32 peek_seq;
1266        u32 *seq;
1267        unsigned long used;
1268        int err;
1269        int target;             /* Read at least this many bytes */
1270        long timeo;
1271        struct task_struct *user_recv = NULL;
1272        int copied_early = 0;
1273        struct sk_buff *skb;
1274
1275        lock_sock(sk);
1276
1277        TCP_CHECK_TIMER(sk);
1278
1279        err = -ENOTCONN;
1280        if (sk->sk_state == TCP_LISTEN)
1281                goto out;
1282
1283        timeo = sock_rcvtimeo(sk, nonblock);
1284
1285        /* Urgent data needs to be handled specially. */
1286        if (flags & MSG_OOB)
1287                goto recv_urg;
1288
1289        seq = &tp->copied_seq;
1290        if (flags & MSG_PEEK) {
1291                peek_seq = tp->copied_seq;
1292                seq = &peek_seq;
1293        }
1294
1295        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1296
1297#ifdef CONFIG_NET_DMA
1298        tp->ucopy.dma_chan = NULL;
1299        preempt_disable();
1300        skb = skb_peek_tail(&sk->sk_receive_queue);
1301        {
1302                int available = 0;
1303
1304                if (skb)
1305                        available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1306                if ((available < target) &&
1307                    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1308                    !sysctl_tcp_low_latency &&
1309                    __get_cpu_var(softnet_data).net_dma) {
1310                        preempt_enable_no_resched();
1311                        tp->ucopy.pinned_list =
1312                                        dma_pin_iovec_pages(msg->msg_iov, len);
1313                } else {
1314                        preempt_enable_no_resched();
1315                }
1316        }
1317#endif
1318
1319        do {
1320                u32 offset;
1321
1322                /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1323                if (tp->urg_data && tp->urg_seq == *seq) {
1324                        if (copied)
1325                                break;
1326                        if (signal_pending(current)) {
1327                                copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1328                                break;
1329                        }
1330                }
1331
1332                /* Next get a buffer. */
1333
1334                skb = skb_peek(&sk->sk_receive_queue);
1335                do {
1336                        if (!skb)
1337                                break;
1338
1339                        /* Now that we have two receive queues this
1340                         * shouldn't happen.
1341                         */
1342                        if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1343                                printk(KERN_INFO "recvmsg bug: copied %X "
1344                                       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1345                                break;
1346                        }
1347                        offset = *seq - TCP_SKB_CB(skb)->seq;
1348                        if (tcp_hdr(skb)->syn)
1349                                offset--;
1350                        if (offset < skb->len)
1351                                goto found_ok_skb;
1352                        if (tcp_hdr(skb)->fin)
1353                                goto found_fin_ok;
1354                        BUG_TRAP(flags & MSG_PEEK);
1355                        skb = skb->next;
1356                } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1357
1358                /* Well, if we have backlog, try to process it now yet. */
1359
1360                if (copied >= target && !sk->sk_backlog.tail)
1361                        break;
1362
1363                if (copied) {
1364                        if (sk->sk_err ||
1365                            sk->sk_state == TCP_CLOSE ||
1366                            (sk->sk_shutdown & RCV_SHUTDOWN) ||
1367                            !timeo ||
1368                            signal_pending(current) ||
1369                            (flags & MSG_PEEK))
1370                                break;
1371                } else {
1372                        if (sock_flag(sk, SOCK_DONE))
1373                                break;
1374
1375                        if (sk->sk_err) {
1376                                copied = sock_error(sk);
1377                                break;
1378                        }
1379
1380                        if (sk->sk_shutdown & RCV_SHUTDOWN)
1381                                break;
1382
1383                        if (sk->sk_state == TCP_CLOSE) {
1384                                if (!sock_flag(sk, SOCK_DONE)) {
1385                                        /* This occurs when user tries to read
1386                                         * from never connected socket.
1387                                         */
1388                                        copied = -ENOTCONN;
1389                                        break;
1390                                }
1391                                break;
1392                        }
1393
1394                        if (!timeo) {
1395                                copied = -EAGAIN;
1396                                break;
1397                        }
1398
1399                        if (signal_pending(current)) {
1400                                copied = sock_intr_errno(timeo);
1401                                break;
1402                        }
1403                }
1404
1405                tcp_cleanup_rbuf(sk, copied);
1406
1407                if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1408                        /* Install new reader */
1409                        if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1410                                user_recv = current;
1411                                tp->ucopy.task = user_recv;
1412                                tp->ucopy.iov = msg->msg_iov;
1413                        }
1414
1415                        tp->ucopy.len = len;
1416
1417                        BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1418                                 (flags & (MSG_PEEK | MSG_TRUNC)));
1419
1420                        /* Ugly... If prequeue is not empty, we have to
1421                         * process it before releasing socket, otherwise
1422                         * order will be broken at second iteration.
1423                         * More elegant solution is required!!!
1424                         *
1425                         * Look: we have the following (pseudo)queues:
1426                         *
1427                         * 1. packets in flight
1428                         * 2. backlog
1429                         * 3. prequeue
1430                         * 4. receive_queue
1431                         *
1432                         * Each queue can be processed only if the next ones
1433                         * are empty. At this point we have empty receive_queue.
1434                         * But prequeue _can_ be not empty after 2nd iteration,
1435                         * when we jumped to start of loop because backlog
1436                         * processing added something to receive_queue.
1437                         * We cannot release_sock(), because backlog contains
1438                         * packets arrived _after_ prequeued ones.
1439                         *
1440                         * Shortly, algorithm is clear --- to process all
1441                         * the queues in order. We could make it more directly,
1442                         * requeueing packets from backlog to prequeue, if
1443                         * is not empty. It is more elegant, but eats cycles,
1444                         * unfortunately.
1445                         */
1446                        if (!skb_queue_empty(&tp->ucopy.prequeue))
1447                                goto do_prequeue;
1448
1449                        /* __ Set realtime policy in scheduler __ */
1450                }
1451
1452                if (copied >= target) {
1453                        /* Do not sleep, just process backlog. */
1454                        release_sock(sk);
1455                        lock_sock(sk);
1456                } else
1457                        sk_wait_data(sk, &timeo);
1458
1459#ifdef CONFIG_NET_DMA
1460                tp->ucopy.wakeup = 0;
1461#endif
1462
1463                if (user_recv) {
1464                        int chunk;
1465
1466                        /* __ Restore normal policy in scheduler __ */
1467
1468                        if ((chunk = len - tp->ucopy.len) != 0) {
1469                                NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1470                                len -= chunk;
1471                                copied += chunk;
1472                        }
1473
1474                        if (tp->rcv_nxt == tp->copied_seq &&
1475                            !skb_queue_empty(&tp->ucopy.prequeue)) {
1476do_prequeue:
1477                                tcp_prequeue_process(sk);
1478
1479                                if ((chunk = len - tp->ucopy.len) != 0) {
1480                                        NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1481                                        len -= chunk;
1482                                        copied += chunk;
1483                                }
1484                        }
1485                }
1486                if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1487                        if (net_ratelimit())
1488                                printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1489                                       current->comm, task_pid_nr(current));
1490                        peek_seq = tp->copied_seq;
1491                }
1492                continue;
1493
1494        found_ok_skb:
1495                /* Ok so how much can we use? */
1496                used = skb->len - offset;
1497                if (len < used)
1498                        used = len;
1499
1500                /* Do we have urgent data here? */
1501                if (tp->urg_data) {
1502                        u32 urg_offset = tp->urg_seq - *seq;
1503                        if (urg_offset < used) {
1504                                if (!urg_offset) {
1505                                        if (!sock_flag(sk, SOCK_URGINLINE)) {
1506                                                ++*seq;
1507                                                offset++;
1508                                                used--;
1509                                                if (!used)
1510                                                        goto skip_copy;
1511                                        }
1512                                } else
1513                                        used = urg_offset;
1514                        }
1515                }
1516
1517                if (!(flags & MSG_TRUNC)) {
1518#ifdef CONFIG_NET_DMA
1519                        if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1520                                tp->ucopy.dma_chan = get_softnet_dma();
1521
1522                        if (tp->ucopy.dma_chan) {
1523                                tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1524                                        tp->ucopy.dma_chan, skb, offset,
1525                                        msg->msg_iov, used,
1526                                        tp->ucopy.pinned_list);
1527
1528                                if (tp->ucopy.dma_cookie < 0) {
1529
1530                                        printk(KERN_ALERT "dma_cookie < 0\n");
1531
1532                                        /* Exception. Bailout! */
1533                                        if (!copied)
1534                                                copied = -EFAULT;
1535                                        break;
1536                                }
1537                                if ((offset + used) == skb->len)
1538                                        copied_early = 1;
1539
1540                        } else
1541#endif
1542                        {
1543                                err = skb_copy_datagram_iovec(skb, offset,
1544                                                msg->msg_iov, used);
1545                                if (err) {
1546                                        /* Exception. Bailout! */
1547                                        if (!copied)
1548                                                copied = -EFAULT;
1549                                        break;
1550                                }
1551                        }
1552                }
1553
1554                *seq += used;
1555                copied += used;
1556                len -= used;
1557
1558                tcp_rcv_space_adjust(sk);
1559
1560skip_copy:
1561                if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1562                        tp->urg_data = 0;
1563                        tcp_fast_path_check(sk);
1564                }
1565                if (used + offset < skb->len)
1566                        continue;
1567
1568                if (tcp_hdr(skb)->fin)
1569                        goto found_fin_ok;
1570                if (!(flags & MSG_PEEK)) {
1571                        sk_eat_skb(sk, skb, copied_early);
1572                        copied_early = 0;
1573                }
1574                continue;
1575
1576        found_fin_ok:
1577                /* Process the FIN. */
1578                ++*seq;
1579                if (!(flags & MSG_PEEK)) {
1580                        sk_eat_skb(sk, skb, copied_early);
1581                        copied_early = 0;
1582                }
1583                break;
1584        } while (len > 0);
1585
1586        if (user_recv) {
1587                if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1588                        int chunk;
1589
1590                        tp->ucopy.len = copied > 0 ? len : 0;
1591
1592                        tcp_prequeue_process(sk);
1593
1594                        if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1595                                NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1596                                len -= chunk;
1597                                copied += chunk;
1598                        }
1599                }
1600
1601                tp->ucopy.task = NULL;
1602                tp->ucopy.len = 0;
1603        }
1604
1605#ifdef CONFIG_NET_DMA
1606        if (tp->ucopy.dma_chan) {
1607                dma_cookie_t done, used;
1608
1609                dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1610
1611                while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1612                                                 tp->ucopy.dma_cookie, &done,
1613                                                 &used) == DMA_IN_PROGRESS) {
1614                        /* do partial cleanup of sk_async_wait_queue */
1615                        while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1616                               (dma_async_is_complete(skb->dma_cookie, done,
1617                                                      used) == DMA_SUCCESS)) {
1618                                __skb_dequeue(&sk->sk_async_wait_queue);
1619                                kfree_skb(skb);
1620                        }
1621                }
1622
1623                /* Safe to free early-copied skbs now */
1624                __skb_queue_purge(&sk->sk_async_wait_queue);
1625                dma_chan_put(tp->ucopy.dma_chan);
1626                tp->ucopy.dma_chan = NULL;
1627        }
1628        if (tp->ucopy.pinned_list) {
1629                dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1630                tp->ucopy.pinned_list = NULL;
1631        }
1632#endif
1633
1634        /* According to UNIX98, msg_name/msg_namelen are ignored
1635         * on connected socket. I was just happy when found this 8) --ANK
1636         */
1637
1638        /* Clean up data we have read: This will do ACK frames. */
1639        tcp_cleanup_rbuf(sk, copied);
1640
1641        TCP_CHECK_TIMER(sk);
1642        release_sock(sk);
1643        return copied;
1644
1645out:
1646        TCP_CHECK_TIMER(sk);
1647        release_sock(sk);
1648        return err;
1649
1650recv_urg:
1651        err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1652        goto out;
1653}
1654
1655void tcp_set_state(struct sock *sk, int state)
1656{
1657        int oldstate = sk->sk_state;
1658
1659        switch (state) {
1660        case TCP_ESTABLISHED:
1661                if (oldstate != TCP_ESTABLISHED)
1662                        TCP_INC_STATS(TCP_MIB_CURRESTAB);
1663                break;
1664
1665        case TCP_CLOSE:
1666                if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1667                        TCP_INC_STATS(TCP_MIB_ESTABRESETS);
1668
1669                sk->sk_prot->unhash(sk);
1670                if (inet_csk(sk)->icsk_bind_hash &&
1671                    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1672                        inet_put_port(sk);
1673                /* fall through */
1674        default:
1675                if (oldstate==TCP_ESTABLISHED)
1676                        TCP_DEC_STATS(TCP_MIB_CURRESTAB);
1677        }
1678
1679        /* Change state AFTER socket is unhashed to avoid closed
1680         * socket sitting in hash tables.
1681         */
1682        sk->sk_state = state;
1683
1684#ifdef STATE_TRACE
1685        SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]);
1686#endif
1687}
1688EXPORT_SYMBOL_GPL(tcp_set_state);
1689
1690/*
1691 *      State processing on a close. This implements the state shift for
1692 *      sending our FIN frame. Note that we only send a FIN for some
1693 *      states. A shutdown() may have already sent the FIN, or we may be
1694 *      closed.
1695 */
1696
1697static const unsigned char new_state[16] = {
1698  /* current state:        new state:      action:      */
1699  /* (Invalid)          */ TCP_CLOSE,
1700  /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1701  /* TCP_SYN_SENT       */ TCP_CLOSE,
1702  /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1703  /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1704  /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1705  /* TCP_TIME_WAIT      */ TCP_CLOSE,
1706  /* TCP_CLOSE          */ TCP_CLOSE,
1707  /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1708  /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1709  /* TCP_LISTEN         */ TCP_CLOSE,
1710  /* TCP_CLOSING        */ TCP_CLOSING,
1711};
1712
1713static int tcp_close_state(struct sock *sk)
1714{
1715        int next = (int)new_state[sk->sk_state];
1716        int ns = next & TCP_STATE_MASK;
1717
1718        tcp_set_state(sk, ns);
1719
1720        return next & TCP_ACTION_FIN;
1721}
1722
1723/*
1724 *      Shutdown the sending side of a connection. Much like close except
1725 *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1726 */
1727
1728void tcp_shutdown(struct sock *sk, int how)
1729{
1730        /*      We need to grab some memory, and put together a FIN,
1731         *      and then put it into the queue to be sent.
1732         *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1733         */
1734        if (!(how & SEND_SHUTDOWN))
1735                return;
1736
1737        /* If we've already sent a FIN, or it's a closed state, skip this. */
1738        if ((1 << sk->sk_state) &
1739            (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1740             TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1741                /* Clear out any half completed packets.  FIN if needed. */
1742                if (tcp_close_state(sk))
1743                        tcp_send_fin(sk);
1744        }
1745}
1746
/*
 * tcp_close - close a TCP socket on behalf of its owner.
 * @sk:      socket being closed
 * @timeout: handed unchanged to sk_stream_wait_close()
 *
 * With the socket lock held, both directions are marked shut down and any
 * data still sitting in the receive queue is discarded.  The connection is
 * then reset (unread data was discarded), disconnected (SOCK_LINGER set with
 * a zero linger time), or sent a FIN when tcp_close_state() asks for one.
 * A listening socket is instead moved straight to TCP_CLOSE and its listen
 * state is torn down with inet_csk_listen_stop().
 *
 * After that the socket is orphaned and, under the BH lock, is either
 * destroyed (state TCP_CLOSE), handed to tcp_time_wait(), or left alive
 * until the protocol finishes closing it.
 */
1747void tcp_close(struct sock *sk, long timeout)
1748{
1749        struct sk_buff *skb;
1750        int data_was_unread = 0;
1751        int state;
1752
1753        lock_sock(sk);
1754        sk->sk_shutdown = SHUTDOWN_MASK;
1755
1756        if (sk->sk_state == TCP_LISTEN) {
1757                tcp_set_state(sk, TCP_CLOSE);
1758
1759                /* Special case. */
1760                inet_csk_listen_stop(sk);
1761
1762                goto adjudge_to_death;
1763        }
1764
1765        /*  We need to flush the recv. buffs.  We do this only on the
1766         *  descriptor close, not protocol-sourced closes, because the
1767         *  reader process may not have drained the data yet!
1768         */
1769        while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                    /* Sequence span of the skb minus its FIN flag, so only
                     * payload bytes count as unread data.
                     */
1770                u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1771                          tcp_hdr(skb)->fin;
1772                data_was_unread += len;
1773                __kfree_skb(skb);
1774        }
1775
1776        sk_mem_reclaim(sk);
1777
1778        /* As outlined in RFC 2525, section 2.17, we send a RST here because
1779         * data was lost. To witness the awful effects of the old behavior of
1780         * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
1781         * GET in an FTP client, suspend the process, wait for the client to
1782         * advertise a zero window, then kill -9 the FTP client, wheee...
1783         * Note: timeout is always zero in such a case.
1784         */
1785        if (data_was_unread) {
1786                /* Unread data was tossed, zap the connection. */
1787                NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1788                tcp_set_state(sk, TCP_CLOSE);
1789                tcp_send_active_reset(sk, GFP_KERNEL);
1790        } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1791                /* Check zero linger _after_ checking for unread data. */
1792                sk->sk_prot->disconnect(sk, 0);
1793                NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1794        } else if (tcp_close_state(sk)) {
1795                /* We FIN if the application ate all the data before
1796                 * zapping the connection.
1797                 */
1798
1799                /* RED-PEN. Formally speaking, we have broken TCP state
1800                 * machine. State transitions:
1801                 *
1802                 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1803                 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1804                 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1805                 *
1806                 * are legal only when FIN has been sent (i.e. in window),
1807                 * rather than queued out of window. Purists blame.
1808                 *
1809                 * F.e. "RFC state" is ESTABLISHED,
1810                 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1811                 *
1812                 * The visible declinations are that sometimes
1813                 * we enter time-wait state, when it is not required really
1814                 * (harmless), do not send active resets, when they are
1815                 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1816                 * they look as CLOSING or LAST_ACK for Linux)
1817                 * Probably, I missed some more holelets.
1818                 *                                              --ANK
1819                 */
1820                tcp_send_fin(sk);
1821        }
1822
1823        sk_stream_wait_close(sk, timeout);
1824
1825adjudge_to_death:
            /* Remember the state seen under the socket lock so that a move
             * to TCP_CLOSE after release_sock() can be detected below.  The
             * reference taken here is dropped by sock_put() at "out".
             */
1826        state = sk->sk_state;
1827        sock_hold(sk);
1828        sock_orphan(sk);
1829        atomic_inc(sk->sk_prot->orphan_count);
1830
1831        /* It is the last release_sock in its life. It will remove backlog. */
1832        release_sock(sk);
1833
1834
1835        /* Now socket is owned by kernel and we acquire BH lock
1836           to finish close. No need to check for user refs.
1837         */
1838        local_bh_disable();
1839        bh_lock_sock(sk);
1840        BUG_TRAP(!sock_owned_by_user(sk));
1841
1842        /* Have we already been destroyed by a softirq or backlog? */
1843        if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1844                goto out;
1845
1846        /*      This is a (useful) BSD violating of the RFC. There is a
1847         *      problem with TCP as specified in that the other end could
1848         *      keep a socket open forever with no application left this end.
1849         *      We use a 3 minute timeout (about the same as BSD) then kill
1850         *      our end. If they send after that then tough - BUT: long enough
1851         *      that we won't make the old 4*rto = almost no time - whoops
1852         *      reset mistake.
1853         *
1854         *      Nope, it was not mistake. It is really desired behaviour
1855         *      f.e. on http servers, when such sockets are useless, but
1856         *      consume significant resources. Let's do it with special
1857         *      linger2 option.                                 --ANK
1858         */
1859
1860        if (sk->sk_state == TCP_FIN_WAIT2) {
1861                struct tcp_sock *tp = tcp_sk(sk);
                    /* Negative linger2: reset at once.  Otherwise either arm
                     * the keepalive timer for the part of the FIN timeout
                     * beyond TCP_TIMEWAIT_LEN, or go straight to time-wait.
                     */
1862                if (tp->linger2 < 0) {
1863                        tcp_set_state(sk, TCP_CLOSE);
1864                        tcp_send_active_reset(sk, GFP_ATOMIC);
1865                        NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1866                } else {
1867                        const int tmo = tcp_fin_time(sk);
1868
1869                        if (tmo > TCP_TIMEWAIT_LEN) {
1870                                inet_csk_reset_keepalive_timer(sk,
1871                                                tmo - TCP_TIMEWAIT_LEN);
1872                        } else {
1873                                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1874                                goto out;
1875                        }
1876                }
1877        }
            /* A socket that is still open is reset when
             * tcp_too_many_orphans() reports that the orphan limit is hit.
             */
1878        if (sk->sk_state != TCP_CLOSE) {
1879                sk_mem_reclaim(sk);
1880                if (tcp_too_many_orphans(sk,
1881                                atomic_read(sk->sk_prot->orphan_count))) {
1882                        if (net_ratelimit())
1883                                printk(KERN_INFO "TCP: too many of orphaned "
1884                                       "sockets\n");
1885                        tcp_set_state(sk, TCP_CLOSE);
1886                        tcp_send_active_reset(sk, GFP_ATOMIC);
1887                        NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1888                }
1889        }
1890
1891        if (sk->sk_state == TCP_CLOSE)
1892                inet_csk_destroy_sock(sk);
1893        /* Otherwise, socket is reprieved until protocol close. */
1894
1895out:
1896        bh_unlock_sock(sk);
1897        local_bh_enable();
1898        sock_put(sk);
1899}
1900
1901/* These states need RST on ABORT according to RFC793 */
1902
1903static inline int tcp_need_reset(int state)
1904{
1905        return (1 << state) &
1906               (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1907                TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1908}
1909
/*
 * tcp_disconnect - abort the current connection and reset per-connection
 * state on the socket.
 * @sk:    socket to disconnect
 * @flags: not used anywhere in this function
 *
 * Moves the socket to TCP_CLOSE, then, depending on the state it was in,
 * stops listening, sends an active reset and records ECONNRESET, or just
 * records ECONNRESET (SYN_SENT).  Timers are cleared, the receive, write
 * and out-of-order queues are purged, and the fields assigned below are
 * reinitialised.  Finishes by calling sk->sk_error_report().
 *
 * Returns err, which this function never changes from its initial 0.
 */
1910int tcp_disconnect(struct sock *sk, int flags)
1911{
1912        struct inet_sock *inet = inet_sk(sk);
1913        struct inet_connection_sock *icsk = inet_csk(sk);
1914        struct tcp_sock *tp = tcp_sk(sk);
1915        int err = 0;
1916        int old_state = sk->sk_state;
1917
1918        if (old_state != TCP_CLOSE)
1919                tcp_set_state(sk, TCP_CLOSE);
1920
1921        /* ABORT function of RFC793 */
1922        if (old_state == TCP_LISTEN) {
1923                inet_csk_listen_stop(sk);
1924        } else if (tcp_need_reset(old_state) ||
1925                   (tp->snd_nxt != tp->write_seq &&
1926                    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1927                /* The last check adjusts for discrepancy of Linux wrt. RFC
1928                 * states
1929                 */
1930                tcp_send_active_reset(sk, gfp_any());
1931                sk->sk_err = ECONNRESET;
1932        } else if (old_state == TCP_SYN_SENT)
1933                sk->sk_err = ECONNRESET;
1934
1935        tcp_clear_xmit_timers(sk);
1936        __skb_queue_purge(&sk->sk_receive_queue);
1937        tcp_write_queue_purge(sk);
1938        __skb_queue_purge(&tp->out_of_order_queue);
1939#ifdef CONFIG_NET_DMA
1940        __skb_queue_purge(&sk->sk_async_wait_queue);
1941#endif
1942
1943        inet->dport = 0;
1944
            /* The source address is reset unless the user locked it. */
1945        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1946                inet_reset_saddr(sk);
1947
1948        sk->sk_shutdown = 0;
1949        sock_reset_flag(sk, SOCK_DONE);
1950        tp->srtt = 0;
            /* Jump write_seq forward by max_window + 2; if the sum lands
             * exactly on 0, use 1 instead.
             */
1951        if ((tp->write_seq += tp->max_window + 2) == 0)
1952                tp->write_seq = 1;
1953        icsk->icsk_backoff = 0;
1954        tp->snd_cwnd = 2;
1955        icsk->icsk_probes_out = 0;
1956        tp->packets_out = 0;
1957        tp->snd_ssthresh = 0x7fffffff;
1958        tp->snd_cwnd_cnt = 0;
1959        tp->bytes_acked = 0;
1960        tcp_set_ca_state(sk, TCP_CA_Open);
1961        tcp_clear_retrans(tp);
1962        inet_csk_delack_init(sk);
1963        tcp_init_send_head(sk);
1964        memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
1965        __sk_dst_reset(sk);
1966
            /* A socket that still has a local port must still be bound. */
1967        BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1968
1969        sk->sk_error_report(sk);
1970        return err;
1971}
1972
1973/*
1974 *      Socket option code for TCP.
1975 */
/*
 * do_tcp_setsockopt - apply a single TCP-level socket option.
 * @sk:      socket to configure
 * @level:   not examined here; tcp_setsockopt() and compat_tcp_setsockopt()
 *           check it before calling
 * @optname: option selector (TCP_MAXSEG, TCP_NODELAY, ...)
 * @optval:  user-space pointer to the option value
 * @optlen:  length in bytes of the data at @optval
 *
 * TCP_CONGESTION takes a string.  Every other option first reads an int
 * from @optval; TCP_MD5SIG then also hands @optval/@optlen to the address
 * family's md5_parse().  Options are applied with the socket lock held.
 *
 * Returns 0 on success, -EINVAL for a too-short or out-of-range value,
 * -EFAULT when the user buffer cannot be read, -ENOPROTOOPT for an option
 * not handled here, or the result of tcp_set_congestion_control() or
 * md5_parse().
 */
1976static int do_tcp_setsockopt(struct sock *sk, int level,
1977                int optname, char __user *optval, int optlen)
1978{
1979        struct tcp_sock *tp = tcp_sk(sk);
1980        struct inet_connection_sock *icsk = inet_csk(sk);
1981        int val;
1982        int err = 0;
1983
1984        /* This is a string value all the others are int's */
1985        if (optname == TCP_CONGESTION) {
1986                char name[TCP_CA_NAME_MAX];
1987
1988                if (optlen < 1)
1989                        return -EINVAL;
1990
                    /* Copy at most TCP_CA_NAME_MAX-1 bytes so the terminator
                     * written at name[val] always fits in the buffer.
                     */
1991                val = strncpy_from_user(name, optval,
1992                                        min(TCP_CA_NAME_MAX-1, optlen));
1993                if (val < 0)
1994                        return -EFAULT;
1995                name[val] = 0;
1996
1997                lock_sock(sk);
1998                err = tcp_set_congestion_control(sk, name);
1999                release_sock(sk);
2000                return err;
2001        }
2002
2003        if (optlen < sizeof(int))
2004                return -EINVAL;
2005
2006        if (get_user(val, (int __user *)optval))
2007                return -EFAULT;
2008
2009        lock_sock(sk);
2010
2011        switch (optname) {
2012        case TCP_MAXSEG:
2013                /* Values greater than interface MTU won't take effect. However
2014                 * at the point when this call is done we typically don't yet
2015                 * know which interface is going to be used */
2016                if (val < 8 || val > MAX_TCP_WINDOW) {
2017                        err = -EINVAL;
2018                        break;
2019                }
2020                tp->rx_opt.user_mss = val;
2021                break;
2022
2023        case TCP_NODELAY:
2024                if (val) {
2025                        /* TCP_NODELAY is weaker than TCP_CORK, so that
2026                         * this option on corked socket is remembered, but
2027                         * it is not activated until cork is cleared.
2028                         *
2029                         * However, when TCP_NODELAY is set we make
2030                         * an explicit push, which overrides even TCP_CORK
2031                         * for currently queued segments.
2032                         */
2033                        tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2034                        tcp_push_pending_frames(sk);
2035                } else {
2036                        tp->nonagle &= ~TCP_NAGLE_OFF;
2037                }
2038                break;
2039
2040        case TCP_CORK:
2041                /* When set indicates to always queue non-full frames.
2042                 * Later the user clears this option and we transmit
2043                 * any pending partial frames in the queue.  This is
2044                 * meant to be used alongside sendfile() to get properly
2045                 * filled frames when the user (for example) must write
2046                 * out headers with a write() call first and then use
2047                 * sendfile to send out the data parts.
2048                 *
2049                 * TCP_CORK can be set together with TCP_NODELAY and it is
2050                 * stronger than TCP_NODELAY.
2051                 */
2052                if (val) {
2053                        tp->nonagle |= TCP_NAGLE_CORK;
2054                } else {
2055                        tp->nonagle &= ~TCP_NAGLE_CORK;
2056                        if (tp->nonagle&TCP_NAGLE_OFF)
2057                                tp->nonagle |= TCP_NAGLE_PUSH;
2058                        tcp_push_pending_frames(sk);
2059                }
2060                break;
2061
2062        case TCP_KEEPIDLE:
2063                if (val < 1 || val > MAX_TCP_KEEPIDLE)
2064                        err = -EINVAL;
2065                else {
                            /* val is in seconds; stored in jiffies.  When
                             * keepalive is enabled on a socket that is neither
                             * closed nor listening, re-arm the timer for the
                             * part of the new idle time that has not already
                             * passed since rcv_tstamp.
                             */
2066                        tp->keepalive_time = val * HZ;
2067                        if (sock_flag(sk, SOCK_KEEPOPEN) &&
2068                            !((1 << sk->sk_state) &
2069                              (TCPF_CLOSE | TCPF_LISTEN))) {
2070                                __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2071                                if (tp->keepalive_time > elapsed)
2072                                        elapsed = tp->keepalive_time - elapsed;
2073                                else
2074                                        elapsed = 0;
2075                                inet_csk_reset_keepalive_timer(sk, elapsed);
2076                        }
2077                }
2078                break;
2079        case TCP_KEEPINTVL:
2080                if (val < 1 || val > MAX_TCP_KEEPINTVL)
2081                        err = -EINVAL;
2082                else
2083                        tp->keepalive_intvl = val * HZ;
2084                break;
2085        case TCP_KEEPCNT:
2086                if (val < 1 || val > MAX_TCP_KEEPCNT)
2087                        err = -EINVAL;
2088                else
2089                        tp->keepalive_probes = val;
2090                break;
2091        case TCP_SYNCNT:
2092                if (val < 1 || val > MAX_TCP_SYNCNT)
2093                        err = -EINVAL;
2094                else
2095                        icsk->icsk_syn_retries = val;
2096                break;
2097
2098        case TCP_LINGER2:
                    /* Negative values are stored as -1, values above
                     * sysctl_tcp_fin_timeout (compared in seconds) as 0, and
                     * anything else is converted to jiffies.
                     */
2099                if (val < 0)
2100                        tp->linger2 = -1;
2101                else if (val > sysctl_tcp_fin_timeout / HZ)
2102                        tp->linger2 = 0;
2103                else
2104                        tp->linger2 = val * HZ;
2105                break;
2106
2107        case TCP_DEFER_ACCEPT:
2108                icsk->icsk_accept_queue.rskq_defer_accept = 0;
2109                if (val > 0) {
2110                        /* Translate value in seconds to number of
2111                         * retransmits */
2112                        while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
2113                               val > ((TCP_TIMEOUT_INIT / HZ) <<
2114                                       icsk->icsk_accept_queue.rskq_defer_accept))
2115                                icsk->icsk_accept_queue.rskq_defer_accept++;
2116                        icsk->icsk_accept_queue.rskq_defer_accept++;
2117                }
2118                break;
2119
2120        case TCP_WINDOW_CLAMP:
                    /* Zero is accepted only on a closed socket; non-zero
                     * values are raised to at least SOCK_MIN_RCVBUF / 2.
                     */
2121                if (!val) {
2122                        if (sk->sk_state != TCP_CLOSE) {
2123                                err = -EINVAL;
2124                                break;
2125                        }
2126                        tp->window_clamp = 0;
2127                } else
2128                        tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2129                                                SOCK_MIN_RCVBUF / 2 : val;
2130                break;
2131
2132        case TCP_QUICKACK:
                    /* Zero sets pingpong; non-zero clears it.  If an ACK is
                     * already scheduled on an ESTABLISHED or CLOSE_WAIT
                     * socket it is marked pushed and tcp_cleanup_rbuf() is
                     * called; an even non-zero value then sets pingpong
                     * again.
                     */
2133                if (!val) {
2134                        icsk->icsk_ack.pingpong = 1;
2135                } else {
2136                        icsk->icsk_ack.pingpong = 0;
2137                        if ((1 << sk->sk_state) &
2138                            (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2139                            inet_csk_ack_scheduled(sk)) {
2140                                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2141                                tcp_cleanup_rbuf(sk, 1);
2142                                if (!(val & 1))
2143                                        icsk->icsk_ack.pingpong = 1;
2144                        }
2145                }
2146                break;
2147
2148#ifdef CONFIG_TCP_MD5SIG
2149        case TCP_MD5SIG:
2150                /* Read the IP->Key mappings from userspace */
2151                err = tp->af_specific->md5_parse(sk, optval, optlen);
2152                break;
2153#endif
2154
2155        default:
2156                err = -ENOPROTOOPT;
2157                break;
2158        }
2159
2160        release_sock(sk);
2161        return err;
2162}
2163
2164int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2165                   int optlen)
2166{
2167        struct inet_connection_sock *icsk = inet_csk(sk);
2168
2169        if (level != SOL_TCP)
2170                return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2171                                                     optval, optlen);
2172        return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2173}
2174
#ifdef CONFIG_COMPAT
/*
 * compat_tcp_setsockopt - CONFIG_COMPAT variant of tcp_setsockopt().
 *
 * SOL_TCP options go to do_tcp_setsockopt(); every other level is
 * forwarded to inet_csk_compat_setsockopt().  Returns whatever the chosen
 * handler returns.
 */
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
                          char __user *optval, int optlen)
{
        if (level == SOL_TCP)
                return do_tcp_setsockopt(sk, level, optname, optval, optlen);

        return inet_csk_compat_setsockopt(sk, level, optname,
                                          optval, optlen);
}

EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif
2187
2188/* Return information about state of tcp endpoint in API format. */
2189void tcp_get_info(struct sock *sk, struct tcp_info *info)
2190{
2191        struct tcp_sock *tp = tcp_sk(sk);
2192        const struct inet_connection_sock *icsk = inet_csk(sk);
2193        u32 now = tcp_time_stamp;
2194
2195        memset(info, 0, sizeof(*info));
2196
2197        info->tcpi_state = sk->sk_state;
2198        info->tcpi_ca_state = icsk->icsk_ca_state;
2199        info->tcpi_retransmits = icsk->icsk_retransmits;
2200        info->tcpi_probes = icsk->icsk_probes_out;
2201        info->tcpi_backoff = icsk->icsk_backoff;
2202
2203        if (tp->rx_opt.tstamp_ok)
2204                info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2205        if (tcp_is_sack(tp))
2206                info->tcpi_options |= TCPI_OPT_SACK;
2207        if (tp->rx_opt.wscale_ok) {
2208                info->tcpi_options |= TCPI_OPT_WSCALE;
2209                info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2210                info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2211        }
2212
2213        if (tp->ecn_flags&TCP_ECN_OK)
2214                info->tcpi_options |= TCPI_OPT_ECN;
2215
2216        info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2217        info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2218        info->tcpi_snd_mss = tp->mss_cache;
2219        info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2220
2221        if (sk->sk_state == TCP_LISTEN) {
2222                info->tcpi_unacked = sk->sk_ack_backlog;
2223                info->tcpi_sacked = sk->sk_max_ack_backlog;
2224        } else {
2225                info->tcpi_unacked = tp->packets_out;
2226                info->tcpi_sacked = tp->sacked_out;
2227        }
2228        info->tcpi_lost = tp->lost_out;
2229        info->tcpi_retrans = tp->retrans_out;
2230        info->tcpi_fackets = tp->fackets_out;
2231
2232        info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2233        info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2234        info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2235
2236        info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2237        info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2238        info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2239        info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2240        info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2241        info->tcpi_snd_cwnd = tp->snd_cwnd;
2242        info->tcpi_advmss = tp->advmss;
2243        info->tcpi_reordering = tp->reordering;
2244
2245        info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2246        info->tcpi_rcv_space = tp->rcvq_space.space;
2247
2248        info->tcpi_total_retrans = tp->total_retrans;
2249}
2250
2251EXPORT_SYMBOL_GPL(tcp_get_info);
2252
2253static int do_tcp_getsockopt(struct sock *sk, int level,
2254                int optname, char __user *optval, int __user *optlen)
2255{
2256        struct inet_connection_sock *icsk = inet_csk(sk);
2257        struct tcp_sock *tp = tcp_sk(sk);
2258        int val, len;
2259
2260        if (get_user(len, optlen))
2261                return -EFAULT;
2262
2263        len = min_t(unsigned int, len, sizeof(int));
2264
2265        if (len < 0)
2266                return -EINVAL;
2267
2268        switch (optname) {
2269        case TCP_MAXSEG:
2270                val = tp->mss_cache;
2271                if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2272                        val = tp->rx_opt.user_mss;
2273                break;
2274        case TCP_NODELAY:
2275                val = !!(tp->nonagle&TCP_NAGLE_OFF);
2276                break;
2277        case TCP_CORK:
2278                val = !!(tp->nonagle&TCP_NAGLE_CORK);
2279                break;
2280        case TCP_KEEPIDLE:
2281                val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2282                break;
2283        case TCP_KEEPINTVL:
2284                val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2285                break;
2286        case TCP_KEEPCNT:
2287                val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2288                break;
2289        case TCP_SYNCNT:
2290                val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2291                break;
2292        case TCP_LINGER2:
2293                val = tp->linger2;
2294                if (val >= 0)
2295                        val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2296                break;
2297        case TCP_DEFER_ACCEPT:
2298                val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2299                        ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2300                break;
2301        case TCP_WINDOW_CLAMP:
2302                val = tp->window_clamp;
2303                break;
2304        case TCP_INFO: {
2305                struct tcp_info info;
2306
2307                if (get_user(len, optlen))
2308                        return -EFAULT;
2309
2310                tcp_get_info(sk, &info);
2311
2312                len = min_t(unsigned int, len, sizeof(info));
2313                if (put_user(len, optlen))
2314                        return -EFAULT;
2315                if (copy_to_user(optval, &info, len))
2316                        return -EFAULT;
2317                return 0;
2318        }
2319        case TCP_QUICKACK:
2320                val = !icsk->icsk_ack.pingpong;
2321                break;
2322
2323        case TCP_CONGESTION:
2324                if (get_user(len, optlen))
2325                        return -EFAULT;
2326                len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2327                if (put_user(len, optlen))
2328                        return -EFAULT;
2329                if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2330                        return -EFAULT;
2331                return 0;
2332        default:
2333                return -ENOPROTOOPT;
2334        }
2335
2336        if (put_user(len, optlen))
2337                return -EFAULT;
2338        if (copy_to_user(optval, &val, len))
2339                return -EFAULT;
2340        return 0;
2341}
2342
2343int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2344                   int __user *optlen)
2345{
2346        struct inet_connection_sock *icsk = inet_csk(sk);
2347
2348        if (level != SOL_TCP)
2349                return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2350                                                     optval, optlen);
2351        return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2352}
2353
#ifdef CONFIG_COMPAT
/*
 * CONFIG_COMPAT variant of tcp_getsockopt().
 *
 * SOL_TCP options take the same path as the native call
 * (do_tcp_getsockopt()); every other level is handed to
 * inet_csk_compat_getsockopt().
 */
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (level == SOL_TCP)
		return do_tcp_getsockopt(sk, level, optname, optval, optlen);

	return inet_csk_compat_getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
2366
2367struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2368{
2369        struct sk_buff *segs = ERR_PTR(-EINVAL);
2370        struct tcphdr *th;
2371        unsigned thlen;
2372        unsigned int seq;
2373        __be32 delta;
2374        unsigned int oldlen;
2375        unsigned int len;
2376
2377        if (!pskb_may_pull(skb, sizeof(*th)))
2378                goto out;
2379
2380        th = tcp_hdr(skb);
2381        thlen = th->doff * 4;
2382        if (thlen < sizeof(*th))
2383                goto out;
2384
2385        if (!pskb_may_pull(skb, thlen))
2386                goto out;
2387
2388        oldlen = (u16)~skb->len;
2389        __skb_pull(skb, thlen);
2390
2391        if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
2392                /* Packet is from an untrusted source, reset gso_segs. */
2393                int type = skb_shinfo(skb)->gso_type;
2394                int mss;
2395
2396                if (unlikely(type &
2397                             ~(SKB_GSO_TCPV4 |
2398                               SKB_GSO_DODGY |
2399                               SKB_GSO_TCP_ECN |
2400                               SKB_GSO_TCPV6 |
2401                               0) ||
2402                             !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
2403                        goto out;
2404
2405                mss = skb_shinfo(skb)->gso_size;
2406                skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
2407
2408                segs = NULL;
2409                goto out;
2410        }
2411
2412        segs = skb_segment(skb, features);
2413        if (IS_ERR(segs))
2414                goto out;
2415
2416        len = skb_shinfo(skb)->gso_size;
2417        delta = htonl(oldlen + (thlen + len));
2418
2419        skb = segs;
2420        th = tcp_hdr(skb);
2421        seq = ntohl(th->seq);
2422
2423        do {
2424                th->fin = th->psh = 0;
2425
2426                th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2427                                       (__force u32)delta));
2428                if (skb->ip_summed != CHECKSUM_PARTIAL)
2429                        th->check =
2430                             csum_fold(csum_partial(skb_transport_header(skb),
2431                                                    thlen, skb->csum));
2432
2433                seq += len;
2434                skb = skb->next;
2435                th = tcp_hdr(skb);
2436
2437                th->seq = htonl(seq);
2438                th->cwr = 0;
2439        } while (skb->next);
2440
2441        delta = htonl(oldlen + (skb->tail - skb->transport_header) +
2442                      skb->data_len);
2443        th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2444                                (__force u32)delta));
2445        if (skb->ip_summed != CHECKSUM_PARTIAL)
2446                th->check = csum_fold(csum_partial(skb_transport_header(skb),
2447                                                   thlen, skb->csum));
2448
2449out:
2450        return segs;
2451}
2452EXPORT_SYMBOL(tcp_tso_segment);
2453
2454#ifdef CONFIG_TCP_MD5SIG
2455static unsigned long tcp_md5sig_users;
2456static struct tcp_md5sig_pool **tcp_md5sig_pool;
2457static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
2458
2459static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2460{
2461        int cpu;
2462        for_each_possible_cpu(cpu) {
2463                struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
2464                if (p) {
2465                        if (p->md5_desc.tfm)
2466                                crypto_free_hash(p->md5_desc.tfm);
2467                        kfree(p);
2468                        p = NULL;
2469                }
2470        }
2471        free_percpu(pool);
2472}
2473
2474void tcp_free_md5sig_pool(void)
2475{
2476        struct tcp_md5sig_pool **pool = NULL;
2477
2478        spin_lock_bh(&tcp_md5sig_pool_lock);
2479        if (--tcp_md5sig_users == 0) {
2480                pool = tcp_md5sig_pool;
2481                tcp_md5sig_pool = NULL;
2482        }
2483        spin_unlock_bh(&tcp_md5sig_pool_lock);
2484        if (pool)
2485                __tcp_free_md5sig_pool(pool);
2486}
2487
2488EXPORT_SYMBOL(tcp_free_md5sig_pool);
2489
2490static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void)
2491{
2492        int cpu;
2493        struct tcp_md5sig_pool **pool;
2494
2495        pool = alloc_percpu(struct tcp_md5sig_pool *);
2496        if (!pool)
2497                return NULL;
2498
2499        for_each_possible_cpu(cpu) {
2500                struct tcp_md5sig_pool *p;
2501                struct crypto_hash *hash;
2502
2503                p = kzalloc(sizeof(*p), GFP_KERNEL);
2504                if (!p)
2505                        goto out_free;
2506                *per_cpu_ptr(pool, cpu) = p;
2507
2508                hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
2509                if (!hash || IS_ERR(hash))
2510                        goto out_free;
2511
2512                p->md5_desc.tfm = hash;
2513        }
2514        return pool;
2515out_free:
2516        __tcp_free_md5sig_pool(pool);
2517        return NULL;
2518}
2519
/*
 * Register a user of the shared MD5 signature pool and return the pool,
 * allocating it when this caller is the first user.
 *
 * tcp_md5sig_users is incremented under tcp_md5sig_pool_lock.  The caller
 * that moves the count from 0 performs the allocation with the lock
 * dropped; callers that find a non-zero count but no pool yet undo their
 * increment and retry.
 *
 * Returns the per-cpu pool, or NULL if the allocation failed (the user
 * count is decremented again in that case).
 */
2520struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void)
2521{
2522        struct tcp_md5sig_pool **pool;
2523        int alloc = 0;
2524
2525retry:
2526        spin_lock_bh(&tcp_md5sig_pool_lock);
2527        pool = tcp_md5sig_pool;
2528        if (tcp_md5sig_users++ == 0) {
                    /* First user: allocate below, once the lock is released. */
2529                alloc = 1;
2530                spin_unlock_bh(&tcp_md5sig_pool_lock);
2531        } else if (!pool) {
                    /*
                     * There are users but no pool pointer yet: take our
                     * increment back and try again.
                     */
2532                tcp_md5sig_users--;
2533                spin_unlock_bh(&tcp_md5sig_pool_lock);
2534                cpu_relax();
2535                goto retry;
2536        } else
                    /* Pool already exists; our increment above is all we need. */
2537                spin_unlock_bh(&tcp_md5sig_pool_lock);
2538
2539        if (alloc) {
2540                /* we cannot hold spinlock here because this may sleep. */
2541                struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool();
2542                spin_lock_bh(&tcp_md5sig_pool_lock);
2543                if (!p) {
                            /* Allocation failed: give back the user count. */
2544                        tcp_md5sig_users--;
2545                        spin_unlock_bh(&tcp_md5sig_pool_lock);
2546                        return NULL;
2547                }
                    /* Re-read the global pointer now that the lock is held again. */
2548                pool = tcp_md5sig_pool;
2549                if (pool) {
2550                        /* oops, it has already been assigned. */
2551                        spin_unlock_bh(&tcp_md5sig_pool_lock);
                            /* Discard our copy and return the existing pool. */
2552                        __tcp_free_md5sig_pool(p);
2553                } else {
2554                        tcp_md5sig_pool = pool = p;
2555                        spin_unlock_bh(&tcp_md5sig_pool_lock);
2556                }
2557        }
2558        return pool;
2559}
2560
2561EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2562
2563struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
2564{
2565        struct tcp_md5sig_pool **p;
2566        spin_lock_bh(&tcp_md5sig_pool_lock);
2567        p = tcp_md5sig_pool;
2568        if (p)
2569                tcp_md5sig_users++;
2570        spin_unlock_bh(&tcp_md5sig_pool_lock);
2571        return (p ? *per_cpu_ptr(p, cpu) : NULL);
2572}
2573
2574EXPORT_SYMBOL(__tcp_get_md5sig_pool);
2575
/*
 * Release a pool user; this simply forwards to tcp_free_md5sig_pool(),
 * which decrements the user count and frees the pool when it hits zero.
 */
void __tcp_put_md5sig_pool(void)
{
	tcp_free_md5sig_pool();
}

EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2582#endif
2583
2584void tcp_done(struct sock *sk)
2585{
2586        if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2587                TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
2588
2589        tcp_set_state(sk, TCP_CLOSE);
2590        tcp_clear_xmit_timers(sk);
2591
2592        sk->sk_shutdown = SHUTDOWN_MASK;
2593
2594        if (!sock_flag(sk, SOCK_DEAD))
2595                sk->sk_state_change(sk);
2596        else
2597                inet_csk_destroy_sock(sk);
2598}
2599EXPORT_SYMBOL_GPL(tcp_done);
2600
2601extern struct tcp_congestion_ops tcp_reno;
2602
2603static __initdata unsigned long thash_entries;
2604static int __init set_thash_entries(char *str)
2605{
2606        if (!str)
2607                return 0;
2608        thash_entries = simple_strtoul(str, &str, 0);
2609        return 1;
2610}
2611__setup("thash_entries=", set_thash_entries);
2612
/*
 * Boot-time initialisation of the TCP layer.
 *
 * Creates the bind-bucket slab cache, allocates and initialises the
 * established and bind hash tables, derives default limits (time-wait
 * buckets, orphans, SYN backlog, tcp_mem/tcp_wmem/tcp_rmem) from the
 * table and memory sizes, and registers the tcp_reno congestion control.
 */
2613void __init tcp_init(void)
2614{
            /* skb is only an operand of sizeof below; it is never dereferenced. */
2615        struct sk_buff *skb = NULL;
2616        unsigned long limit;
2617        int order, i, max_share;
2618
            /* Build fails if struct tcp_skb_cb does not fit into skb->cb. */
2619        BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
2620
2621        tcp_hashinfo.bind_bucket_cachep =
2622                kmem_cache_create("tcp_bind_bucket",
2623                                  sizeof(struct inet_bind_bucket), 0,
2624                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2625
2626        /* Size and allocate the main established and bind bucket
2627         * hash tables.
2628         *
2629         * The methodology is similar to that of the buffer cache.
2630         */
2631        tcp_hashinfo.ehash =
2632                alloc_large_system_hash("TCP established",
2633                                        sizeof(struct inet_ehash_bucket),
2634                                        thash_entries,
2635                                        (num_physpages >= 128 * 1024) ?
2636                                        13 : 15,
2637                                        0,
2638                                        &tcp_hashinfo.ehash_size,
2639                                        NULL,
2640                                        thash_entries ? 0 : 512 * 1024);
            /*
             * ehash_size was written by alloc_large_system_hash() above and
             * is used as a shift count here to obtain the number of buckets
             * initialised by the loop that follows.
             */
2641        tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2642        for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2643                INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2644                INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
2645        }
2646        if (inet_ehash_locks_alloc(&tcp_hashinfo))
2647                panic("TCP: failed to alloc ehash_locks");
            /* The bind table is requested with as many entries as the ehash. */
2648        tcp_hashinfo.bhash =
2649                alloc_large_system_hash("TCP bind",
2650                                        sizeof(struct inet_bind_hashbucket),
2651                                        tcp_hashinfo.ehash_size,
2652                                        (num_physpages >= 128 * 1024) ?
2653                                        13 : 15,
2654                                        0,
2655                                        &tcp_hashinfo.bhash_size,
2656                                        NULL,
2657                                        64 * 1024);
            /* Same shift-to-count conversion as for ehash_size above. */
2658        tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2659        for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2660                spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2661                INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2662        }
2663
2664        /* Try to be a bit smarter and adjust defaults depending
2665         * on available memory.
2666         */
            /*
             * Find the smallest order for which (1 << order) pages are at
             * least as large as the bind hash table; the loop body is empty.
             */
2667        for (order = 0; ((1 << order) << PAGE_SHIFT) <
2668                        (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2669                        order++)
2670                ;
            /*
             * order >= 4 raises the defaults, order < 3 scales them down,
             * and order == 3 leaves them as they are.
             */
2671        if (order >= 4) {
2672                tcp_death_row.sysctl_max_tw_buckets = 180000;
2673                sysctl_tcp_max_orphans = 4096 << (order - 4);
2674                sysctl_max_syn_backlog = 1024;
2675        } else if (order < 3) {
2676                tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2677                sysctl_tcp_max_orphans >>= (3 - order);
2678                sysctl_max_syn_backlog = 128;
2679        }
2680
2681        /* Set the pressure threshold to be a fraction of global memory that
2682         * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
2683         * memory, with a floor of 128 pages.
2684         */
2685        limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
2686        limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
2687        limit = max(limit, 128UL);
            /* tcp_mem[0] is 3/4 of the limit and tcp_mem[2] is twice tcp_mem[0]. */
2688        sysctl_tcp_mem[0] = limit / 4 * 3;
2689        sysctl_tcp_mem[1] = limit;
2690        sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
2691
2692        /* Set per-socket limits to no more than 1/128 the pressure threshold */
2693        limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
            /* The per-socket share is additionally capped at 4 MB. */
2694        max_share = min(4UL*1024*1024, limit);
2695
            /* The third wmem/rmem entry is never below its fixed value. */
2696        sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
2697        sysctl_tcp_wmem[1] = 16*1024;
2698        sysctl_tcp_wmem[2] = max(64*1024, max_share);
2699
2700        sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
2701        sysctl_tcp_rmem[1] = 87380;
2702        sysctl_tcp_rmem[2] = max(87380, max_share);
2703
2704        printk(KERN_INFO "TCP: Hash tables configured "
2705               "(established %d bind %d)\n",
2706               tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
2707
            /* tcp_reno is the extern congestion-ops object declared above. */
2708        tcp_register_congestion_control(&tcp_reno);
2709}
2710
/* Symbols from this file exported through EXPORT_SYMBOL(). */
2711EXPORT_SYMBOL(tcp_close);
2712EXPORT_SYMBOL(tcp_disconnect);
2713EXPORT_SYMBOL(tcp_getsockopt);
2714EXPORT_SYMBOL(tcp_ioctl);
2715EXPORT_SYMBOL(tcp_poll);
2716EXPORT_SYMBOL(tcp_read_sock);
2717EXPORT_SYMBOL(tcp_recvmsg);
2718EXPORT_SYMBOL(tcp_sendmsg);
2719EXPORT_SYMBOL(tcp_splice_read);
2720EXPORT_SYMBOL(tcp_sendpage);
2721EXPORT_SYMBOL(tcp_setsockopt);
2722EXPORT_SYMBOL(tcp_shutdown);
2723EXPORT_SYMBOL(tcp_statistics);
2724
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.