linux/net/ipv4/tcp.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  11 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  14 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  15 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  16 *              Matthew Dillon, <dillon@apollo.west.oic.com>
  17 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18 *              Jorge Cwik, <jorge@laser.satlink.net>
  19 *
  20 * Fixes:
  21 *              Alan Cox        :       Numerous verify_area() calls
  22 *              Alan Cox        :       Set the ACK bit on a reset
  23 *              Alan Cox        :       Stopped it crashing if it closed while
  24 *                                      sk->inuse=1 and was trying to connect
  25 *                                      (tcp_err()).
  26 *              Alan Cox        :       All icmp error handling was broken
   27 *                                      pointers passed were wrong and the
  28 *                                      socket was looked up backwards. Nobody
  29 *                                      tested any icmp error code obviously.
  30 *              Alan Cox        :       tcp_err() now handled properly. It
  31 *                                      wakes people on errors. poll
  32 *                                      behaves and the icmp error race
  33 *                                      has gone by moving it into sock.c
  34 *              Alan Cox        :       tcp_send_reset() fixed to work for
  35 *                                      everything not just packets for
  36 *                                      unknown sockets.
  37 *              Alan Cox        :       tcp option processing.
  38 *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  39 *                                      syn rule wrong]
  40 *              Herp Rosmanith  :       More reset fixes
  41 *              Alan Cox        :       No longer acks invalid rst frames.
  42 *                                      Acking any kind of RST is right out.
  43 *              Alan Cox        :       Sets an ignore me flag on an rst
  44 *                                      receive otherwise odd bits of prattle
  45 *                                      escape still
  46 *              Alan Cox        :       Fixed another acking RST frame bug.
  47 *                                      Should stop LAN workplace lockups.
  48 *              Alan Cox        :       Some tidyups using the new skb list
  49 *                                      facilities
  50 *              Alan Cox        :       sk->keepopen now seems to work
  51 *              Alan Cox        :       Pulls options out correctly on accepts
  52 *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  53 *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  54 *                                      bit to skb ops.
  55 *              Alan Cox        :       Tidied tcp_data to avoid a potential
  56 *                                      nasty.
  57 *              Alan Cox        :       Added some better commenting, as the
  58 *                                      tcp is hard to follow
  59 *              Alan Cox        :       Removed incorrect check for 20 * psh
  60 *      Michael O'Reilly        :       ack < copied bug fix.
  61 *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  62 *              Alan Cox        :       FIN with no memory -> CRASH
  63 *              Alan Cox        :       Added socket option proto entries.
  64 *                                      Also added awareness of them to accept.
  65 *              Alan Cox        :       Added TCP options (SOL_TCP)
  66 *              Alan Cox        :       Switched wakeup calls to callbacks,
  67 *                                      so the kernel can layer network
  68 *                                      sockets.
  69 *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  70 *              Alan Cox        :       Handle FIN (more) properly (we hope).
  71 *              Alan Cox        :       RST frames sent on unsynchronised
  72 *                                      state ack error.
  73 *              Alan Cox        :       Put in missing check for SYN bit.
  74 *              Alan Cox        :       Added tcp_select_window() aka NET2E
  75 *                                      window non shrink trick.
  76 *              Alan Cox        :       Added a couple of small NET2E timer
  77 *                                      fixes
  78 *              Charles Hedrick :       TCP fixes
  79 *              Toomas Tamm     :       TCP window fixes
  80 *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  81 *              Charles Hedrick :       Rewrote most of it to actually work
  82 *              Linus           :       Rewrote tcp_read() and URG handling
  83 *                                      completely
  84 *              Gerhard Koerting:       Fixed some missing timer handling
  85 *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  86 *              Gerhard Koerting:       PC/TCP workarounds
  87 *              Adam Caldwell   :       Assorted timer/timing errors
  88 *              Matthew Dillon  :       Fixed another RST bug
  89 *              Alan Cox        :       Move to kernel side addressing changes.
  90 *              Alan Cox        :       Beginning work on TCP fastpathing
  91 *                                      (not yet usable)
  92 *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  93 *              Alan Cox        :       TCP fast path debugging
  94 *              Alan Cox        :       Window clamping
  95 *              Michael Riepe   :       Bug in tcp_check()
  96 *              Matt Dillon     :       More TCP improvements and RST bug fixes
   97 *              Matt Dillon     :       Yet more small nasties removed from the
  98 *                                      TCP code (Be very nice to this man if
  99 *                                      tcp finally works 100%) 8)
 100 *              Alan Cox        :       BSD accept semantics.
 101 *              Alan Cox        :       Reset on closedown bug.
 102 *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 103 *              Michael Pall    :       Handle poll() after URG properly in
 104 *                                      all cases.
 105 *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 106 *                                      (multi URG PUSH broke rlogin).
 107 *              Michael Pall    :       Fix the multi URG PUSH problem in
 108 *                                      tcp_readable(), poll() after URG
 109 *                                      works now.
 110 *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 111 *                                      BSD api.
 112 *              Alan Cox        :       Changed the semantics of sk->socket to
 113 *                                      fix a race and a signal problem with
 114 *                                      accept() and async I/O.
 115 *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 116 *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 117 *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 118 *                                      clients/servers which listen in on
 119 *                                      fixed ports.
 120 *              Alan Cox        :       Cleaned the above up and shrank it to
 121 *                                      a sensible code size.
 122 *              Alan Cox        :       Self connect lockup fix.
 123 *              Alan Cox        :       No connect to multicast.
 124 *              Ross Biro       :       Close unaccepted children on master
 125 *                                      socket close.
 126 *              Alan Cox        :       Reset tracing code.
 127 *              Alan Cox        :       Spurious resets on shutdown.
 128 *              Alan Cox        :       Giant 15 minute/60 second timer error
 129 *              Alan Cox        :       Small whoops in polling before an
 130 *                                      accept.
 131 *              Alan Cox        :       Kept the state trace facility since
 132 *                                      it's handy for debugging.
 133 *              Alan Cox        :       More reset handler fixes.
 134 *              Alan Cox        :       Started rewriting the code based on
 135 *                                      the RFC's for other useful protocol
 136 *                                      references see: Comer, KA9Q NOS, and
 137 *                                      for a reference on the difference
 138 *                                      between specifications and how BSD
 139 *                                      works see the 4.4lite source.
 140 *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 141 *                                      close.
 142 *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 143 *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 144 *              Alan Cox        :       Reimplemented timers as per the RFC
 145 *                                      and using multiple timers for sanity.
 146 *              Alan Cox        :       Small bug fixes, and a lot of new
 147 *                                      comments.
 148 *              Alan Cox        :       Fixed dual reader crash by locking
 149 *                                      the buffers (much like datagram.c)
 150 *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 151 *                                      now gets fed up of retrying without
 152 *                                      (even a no space) answer.
 153 *              Alan Cox        :       Extracted closing code better
 154 *              Alan Cox        :       Fixed the closing state machine to
 155 *                                      resemble the RFC.
 156 *              Alan Cox        :       More 'per spec' fixes.
 157 *              Jorge Cwik      :       Even faster checksumming.
 158 *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 159 *                                      only frames. At least one pc tcp stack
 160 *                                      generates them.
 161 *              Alan Cox        :       Cache last socket.
 162 *              Alan Cox        :       Per route irtt.
 163 *              Matt Day        :       poll()->select() match BSD precisely on error
 164 *              Alan Cox        :       New buffers
 165 *              Marc Tamsky     :       Various sk->prot->retransmits and
 166 *                                      sk->retransmits misupdating fixed.
 167 *                                      Fixed tcp_write_timeout: stuck close,
 168 *                                      and TCP syn retries gets used now.
 169 *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 170 *                                      ack if state is TCP_CLOSED.
 171 *              Alan Cox        :       Look up device on a retransmit - routes may
 172 *                                      change. Doesn't yet cope with MSS shrink right
 173 *                                      but it's a start!
 174 *              Marc Tamsky     :       Closing in closing fixes.
 175 *              Mike Shaver     :       RFC1122 verifications.
 176 *              Alan Cox        :       rcv_saddr errors.
 177 *              Alan Cox        :       Block double connect().
 178 *              Alan Cox        :       Small hooks for enSKIP.
 179 *              Alexey Kuznetsov:       Path MTU discovery.
 180 *              Alan Cox        :       Support soft errors.
 181 *              Alan Cox        :       Fix MTU discovery pathological case
 182 *                                      when the remote claims no mtu!
 183 *              Marc Tamsky     :       TCP_CLOSE fix.
 184 *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 185 *                                      window but wrong (fixes NT lpd problems)
 186 *              Pedro Roque     :       Better TCP window handling, delayed ack.
 187 *              Joerg Reuter    :       No modification of locked buffers in
 188 *                                      tcp_do_retransmit()
 189 *              Eric Schenk     :       Changed receiver side silly window
 190 *                                      avoidance algorithm to BSD style
 191 *                                      algorithm. This doubles throughput
 192 *                                      against machines running Solaris,
 193 *                                      and seems to result in general
 194 *                                      improvement.
 195 *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 196 *      Willy Konynenberg       :       Transparent proxying support.
 197 *      Mike McLagan            :       Routing by source
 198 *              Keith Owens     :       Do proper merging with partial SKB's in
 199 *                                      tcp_do_sendmsg to avoid burstiness.
 200 *              Eric Schenk     :       Fix fast close down bug with
 201 *                                      shutdown() followed by close().
 202 *              Andi Kleen      :       Make poll agree with SIGIO
 203 *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 204 *                                      lingertime == 0 (RFC 793 ABORT Call)
 205 *      Hirokazu Takahashi      :       Use copy_from_user() instead of
 206 *                                      csum_and_copy_from_user() if possible.
 207 *
 208 *              This program is free software; you can redistribute it and/or
 209 *              modify it under the terms of the GNU General Public License
 210 *              as published by the Free Software Foundation; either version
  211 *              2 of the License, or (at your option) any later version.
 212 *
 213 * Description of States:
 214 *
 215 *      TCP_SYN_SENT            sent a connection request, waiting for ack
 216 *
 217 *      TCP_SYN_RECV            received a connection request, sent ack,
 218 *                              waiting for final ack in three-way handshake.
 219 *
 220 *      TCP_ESTABLISHED         connection established
 221 *
 222 *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 223 *                              transmission of remaining buffered data
 224 *
 225 *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 226 *                              to shutdown
 227 *
 228 *      TCP_CLOSING             both sides have shutdown but we still have
 229 *                              data we have to finish sending
 230 *
 231 *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 232 *                              closed, can only be entered from FIN_WAIT2
 233 *                              or CLOSING.  Required because the other end
 234 *                              may not have gotten our last ACK causing it
 235 *                              to retransmit the data packet (which we ignore)
 236 *
 237 *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 238 *                              us to finish writing our data and to shutdown
 239 *                              (we have to close() to move on to LAST_ACK)
 240 *
  241 *      TCP_LAST_ACK            our side has shutdown after remote has
 242 *                              shutdown.  There may still be data in our
 243 *                              buffer that we have to finish sending
 244 *
 245 *      TCP_CLOSE               socket is finished
 246 */
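/* [Editorial example, not part of the kernel source] The states described
 * above are visible to applications as the tcpi_state field returned by the
 * TCP_INFO socket option. A minimal userspace sketch, assuming "fd" is an
 * already-created TCP socket:
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>        /* struct tcp_info, TCP_INFO */

static int print_tcp_state(int fd)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        memset(&info, 0, sizeof(info));
        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0) {
                perror("getsockopt(TCP_INFO)");
                return -1;
        }
        /* tcpi_state holds one of the TCP_* state values listed above,
         * e.g. 1 for TCP_ESTABLISHED.
         */
        printf("tcpi_state = %u\n", info.tcpi_state);
        return 0;
}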
 247
 248#define pr_fmt(fmt) "TCP: " fmt
 249
 250#include <crypto/hash.h>
 251#include <linux/kernel.h>
 252#include <linux/module.h>
 253#include <linux/types.h>
 254#include <linux/fcntl.h>
 255#include <linux/poll.h>
 256#include <linux/inet_diag.h>
 257#include <linux/init.h>
 258#include <linux/fs.h>
 259#include <linux/skbuff.h>
 260#include <linux/scatterlist.h>
 261#include <linux/splice.h>
 262#include <linux/net.h>
 263#include <linux/socket.h>
 264#include <linux/random.h>
 265#include <linux/bootmem.h>
 266#include <linux/highmem.h>
 267#include <linux/swap.h>
 268#include <linux/cache.h>
 269#include <linux/err.h>
 270#include <linux/time.h>
 271#include <linux/slab.h>
 272
 273#include <net/icmp.h>
 274#include <net/inet_common.h>
 275#include <net/tcp.h>
 276#include <net/xfrm.h>
 277#include <net/ip.h>
 278#include <net/sock.h>
 279
 280#include <asm/uaccess.h>
 281#include <asm/ioctls.h>
 282#include <asm/unaligned.h>
 283#include <net/busy_poll.h>
 284
 285int sysctl_tcp_min_tso_segs __read_mostly = 2;
 286
 287int sysctl_tcp_autocorking __read_mostly = 1;
 288
 289struct percpu_counter tcp_orphan_count;
 290EXPORT_SYMBOL_GPL(tcp_orphan_count);
 291
 292long sysctl_tcp_mem[3] __read_mostly;
 293int sysctl_tcp_wmem[3] __read_mostly;
 294int sysctl_tcp_rmem[3] __read_mostly;
 295
 296EXPORT_SYMBOL(sysctl_tcp_mem);
 297EXPORT_SYMBOL(sysctl_tcp_rmem);
 298EXPORT_SYMBOL(sysctl_tcp_wmem);
 299
 300atomic_long_t tcp_memory_allocated;     /* Current allocated memory. */
 301EXPORT_SYMBOL(tcp_memory_allocated);
 302
 303/*
 304 * Current number of TCP sockets.
 305 */
 306struct percpu_counter tcp_sockets_allocated;
 307EXPORT_SYMBOL(tcp_sockets_allocated);
 308
 309/*
 310 * TCP splice context
 311 */
 312struct tcp_splice_state {
 313        struct pipe_inode_info *pipe;
 314        size_t len;
 315        unsigned int flags;
 316};
 317
 318/*
 319 * Pressure flag: try to collapse.
 320 * Technical note: it is used by multiple contexts non atomically.
 321 * All the __sk_mem_schedule() is of this nature: accounting
 322 * is strict, actions are advisory and have some latency.
 323 */
 324int tcp_memory_pressure __read_mostly;
 325EXPORT_SYMBOL(tcp_memory_pressure);
 326
 327void tcp_enter_memory_pressure(struct sock *sk)
 328{
 329        if (!tcp_memory_pressure) {
 330                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
 331                tcp_memory_pressure = 1;
 332        }
 333}
 334EXPORT_SYMBOL(tcp_enter_memory_pressure);
 335
 336/* Convert seconds to retransmits based on initial and max timeout */
 337static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
 338{
 339        u8 res = 0;
 340
 341        if (seconds > 0) {
 342                int period = timeout;
 343
 344                res = 1;
 345                while (seconds > period && res < 255) {
 346                        res++;
 347                        timeout <<= 1;
 348                        if (timeout > rto_max)
 349                                timeout = rto_max;
 350                        period += timeout;
 351                }
 352        }
 353        return res;
 354}
 355
 356/* Convert retransmits to seconds based on initial and max timeout */
 357static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
 358{
 359        int period = 0;
 360
 361        if (retrans > 0) {
 362                period = timeout;
 363                while (--retrans) {
 364                        timeout <<= 1;
 365                        if (timeout > rto_max)
 366                                timeout = rto_max;
 367                        period += timeout;
 368                }
 369        }
 370        return period;
 371}
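/* [Editorial example, not part of the kernel source] The two helpers above map
 * a retransmission count to the total time covered by exponential backoff
 * (doubling the timeout, clamped at rto_max) and back again. A standalone
 * userspace mirror of the same arithmetic, using arbitrary example values
 * (1 second initial timeout, 120 second cap):
 */
#include <stdio.h>

static int example_retrans_to_secs(unsigned char retrans, int timeout, int rto_max)
{
        int period = 0;

        if (retrans > 0) {
                period = timeout;
                while (--retrans) {
                        timeout <<= 1;              /* exponential backoff */
                        if (timeout > rto_max)
                                timeout = rto_max;  /* clamp at the RTO maximum */
                        period += timeout;
                }
        }
        return period;
}

int main(void)
{
        /* 5 retransmits at 1 s initial timeout: 1 + 2 + 4 + 8 + 16 = 31 s */
        printf("%d\n", example_retrans_to_secs(5, 1, 120));
        return 0;
}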
 372
 373/* Address-family independent initialization for a tcp_sock.
 374 *
 375 * NOTE: A lot of things set to zero explicitly by call to
 376 *       sk_alloc() so need not be done here.
 377 */
 378void tcp_init_sock(struct sock *sk)
 379{
 380        struct inet_connection_sock *icsk = inet_csk(sk);
 381        struct tcp_sock *tp = tcp_sk(sk);
 382
 383        tp->out_of_order_queue = RB_ROOT;
 384        tcp_init_xmit_timers(sk);
 385        tcp_prequeue_init(tp);
 386        INIT_LIST_HEAD(&tp->tsq_node);
 387
 388        icsk->icsk_rto = TCP_TIMEOUT_INIT;
 389        tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
 390        minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
 391
 392        /* So many TCP implementations out there (incorrectly) count the
 393         * initial SYN frame in their delayed-ACK and congestion control
 394         * algorithms that we must have the following bandaid to talk
 395         * efficiently to them.  -DaveM
 396         */
 397        tp->snd_cwnd = TCP_INIT_CWND;
 398
 399        /* There's a bubble in the pipe until at least the first ACK. */
 400        tp->app_limited = ~0U;
 401
 402        /* See draft-stevens-tcpca-spec-01 for discussion of the
 403         * initialization of these values.
 404         */
 405        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 406        tp->snd_cwnd_clamp = ~0;
 407        tp->mss_cache = TCP_MSS_DEFAULT;
 408        u64_stats_init(&tp->syncp);
 409
 410        tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 411        tcp_enable_early_retrans(tp);
 412        tcp_assign_congestion_control(sk);
 413
 414        tp->tsoffset = 0;
 415
 416        sk->sk_state = TCP_CLOSE;
 417
 418        sk->sk_write_space = sk_stream_write_space;
 419        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 420
 421        icsk->icsk_sync_mss = tcp_sync_mss;
 422
 423        sk->sk_sndbuf = sysctl_tcp_wmem[1];
 424        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 425
 426        local_bh_disable();
 427        sk_sockets_allocated_inc(sk);
 428        local_bh_enable();
 429}
 430EXPORT_SYMBOL(tcp_init_sock);
 431
 432static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
 433{
 434        if (tsflags) {
 435                struct skb_shared_info *shinfo = skb_shinfo(skb);
 436                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 437
 438                sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
 439                if (tsflags & SOF_TIMESTAMPING_TX_ACK)
 440                        tcb->txstamp_ack = 1;
 441                if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
 442                        shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
 443        }
 444}
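/* [Editorial example, not part of the kernel source] The tsflags consumed by
 * tcp_tx_timestamp() are normally set with the SO_TIMESTAMPING socket option;
 * the constant names come from <linux/net_tstamp.h>. A minimal sketch,
 * assuming "fd" is a TCP socket; reading the generated reports back from the
 * socket error queue is omitted:
 */
#include <sys/socket.h>
#include <linux/net_tstamp.h>   /* SOF_TIMESTAMPING_* */

static int enable_tx_ack_timestamps(int fd)
{
        unsigned int flags = SOF_TIMESTAMPING_TX_ACK |   /* stamp when the data is ACKed */
                             SOF_TIMESTAMPING_SOFTWARE | /* report software timestamps */
                             SOF_TIMESTAMPING_OPT_ID;    /* tag each report with an id */

        return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
}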
 445
 446/*
 447 *      Wait for a TCP event.
 448 *
 449 *      Note that we don't need to lock the socket, as the upper poll layers
 450 *      take care of normal races (between the test and the event) and we don't
 451 *      go look at any of the socket buffers directly.
 452 */
 453unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 454{
 455        unsigned int mask;
 456        struct sock *sk = sock->sk;
 457        const struct tcp_sock *tp = tcp_sk(sk);
 458        int state;
 459
 460        sock_rps_record_flow(sk);
 461
 462        sock_poll_wait(file, sk_sleep(sk), wait);
 463
 464        state = sk_state_load(sk);
 465        if (state == TCP_LISTEN)
 466                return inet_csk_listen_poll(sk);
 467
 468        /* Socket is not locked. We are protected from async events
 469         * by poll logic and correct handling of state changes
 470         * made by other threads is impossible in any case.
 471         */
 472
 473        mask = 0;
 474
 475        /*
 476         * POLLHUP is certainly not done right. But poll() doesn't
 477         * have a notion of HUP in just one direction, and for a
 478         * socket the read side is more interesting.
 479         *
 480         * Some poll() documentation says that POLLHUP is incompatible
 481         * with the POLLOUT/POLLWR flags, so somebody should check this
 482         * all. But careful, it tends to be safer to return too many
 483         * bits than too few, and you can easily break real applications
 484         * if you don't tell them that something has hung up!
 485         *
 486         * Check-me.
 487         *
 488         * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 489         * our fs/select.c). It means that after we received EOF,
 490         * poll always returns immediately, making impossible poll() on write()
 491         * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 492         * if and only if shutdown has been made in both directions.
 493         * Actually, it is interesting to look how Solaris and DUX
 494         * solve this dilemma. I would prefer, if POLLHUP were maskable,
 495         * then we could set it on SND_SHUTDOWN. BTW examples given
 496         * in Stevens' books assume exactly this behaviour, it explains
 497         * why POLLHUP is incompatible with POLLOUT.    --ANK
 498         *
 499         * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 500         * blocking on fresh not-connected or disconnected socket. --ANK
 501         */
 502        if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
 503                mask |= POLLHUP;
 504        if (sk->sk_shutdown & RCV_SHUTDOWN)
 505                mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 506
 507        /* Connected or passive Fast Open socket? */
 508        if (state != TCP_SYN_SENT &&
 509            (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
 510                int target = sock_rcvlowat(sk, 0, INT_MAX);
 511
 512                if (tp->urg_seq == tp->copied_seq &&
 513                    !sock_flag(sk, SOCK_URGINLINE) &&
 514                    tp->urg_data)
 515                        target++;
 516
 517                if (tp->rcv_nxt - tp->copied_seq >= target)
 518                        mask |= POLLIN | POLLRDNORM;
 519
 520                if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 521                        if (sk_stream_is_writeable(sk)) {
 522                                mask |= POLLOUT | POLLWRNORM;
 523                        } else {  /* send SIGIO later */
 524                                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 525                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 526
 527                                /* Race breaker. If space is freed after
 528                                 * wspace test but before the flags are set,
 529                                 * IO signal will be lost. Memory barrier
 530                                 * pairs with the input side.
 531                                 */
 532                                smp_mb__after_atomic();
 533                                if (sk_stream_is_writeable(sk))
 534                                        mask |= POLLOUT | POLLWRNORM;
 535                        }
 536                } else
 537                        mask |= POLLOUT | POLLWRNORM;
 538
 539                if (tp->urg_data & TCP_URG_VALID)
 540                        mask |= POLLPRI;
 541        }
 542        /* This barrier is coupled with smp_wmb() in tcp_reset() */
 543        smp_rmb();
 544        if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
 545                mask |= POLLERR;
 546
 547        return mask;
 548}
 549EXPORT_SYMBOL(tcp_poll);
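/* [Editorial example, not part of the kernel source] How the mask computed by
 * tcp_poll() is typically consumed from userspace. POLLRDHUP must be requested
 * explicitly and needs _GNU_SOURCE; "fd" is assumed to be a connected socket.
 */
#define _GNU_SOURCE
#include <poll.h>
#include <stdio.h>

static void wait_for_events(int fd)
{
        struct pollfd pfd = {
                .fd     = fd,
                .events = POLLIN | POLLOUT | POLLRDHUP,
        };

        if (poll(&pfd, 1, 5000) <= 0)   /* 5 s timeout, or error */
                return;

        if (pfd.revents & POLLIN)
                printf("data readable (or FIN pending)\n");
        if (pfd.revents & POLLOUT)
                printf("socket writeable\n");
        if (pfd.revents & POLLRDHUP)
                printf("peer shut down its send side\n");
        if (pfd.revents & (POLLERR | POLLHUP))
                printf("error, or both directions shut down\n");
}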
 550
 551int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 552{
 553        struct tcp_sock *tp = tcp_sk(sk);
 554        int answ;
 555        bool slow;
 556
 557        switch (cmd) {
 558        case SIOCINQ:
 559                if (sk->sk_state == TCP_LISTEN)
 560                        return -EINVAL;
 561
 562                slow = lock_sock_fast(sk);
 563                answ = tcp_inq(sk);
 564                unlock_sock_fast(sk, slow);
 565                break;
 566        case SIOCATMARK:
 567                answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 568                break;
 569        case SIOCOUTQ:
 570                if (sk->sk_state == TCP_LISTEN)
 571                        return -EINVAL;
 572
 573                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 574                        answ = 0;
 575                else
 576                        answ = tp->write_seq - tp->snd_una;
 577                break;
 578        case SIOCOUTQNSD:
 579                if (sk->sk_state == TCP_LISTEN)
 580                        return -EINVAL;
 581
 582                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 583                        answ = 0;
 584                else
 585                        answ = tp->write_seq - tp->snd_nxt;
 586                break;
 587        default:
 588                return -ENOIOCTLCMD;
 589        }
 590
 591        return put_user(answ, (int __user *)arg);
 592}
 593EXPORT_SYMBOL(tcp_ioctl);
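/* [Editorial example, not part of the kernel source] The SIOCINQ/SIOCOUTQ
 * requests handled above report unread bytes in the receive queue and
 * not-yet-acknowledged bytes in the write queue (SIOCOUTQNSD reports only the
 * not-yet-sent part). A minimal sketch, assuming a connected socket "fd":
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>      /* SIOCINQ, SIOCOUTQ, SIOCOUTQNSD */

static void print_queue_depths(int fd)
{
        int inq = 0, outq = 0;

        if (ioctl(fd, SIOCINQ, &inq) == 0)     /* bytes readable right now */
                printf("receive queue: %d bytes\n", inq);
        if (ioctl(fd, SIOCOUTQ, &outq) == 0)   /* bytes not yet ACKed by the peer */
                printf("send queue:    %d bytes\n", outq);
}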
 594
 595static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 596{
 597        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
 598        tp->pushed_seq = tp->write_seq;
 599}
 600
 601static inline bool forced_push(const struct tcp_sock *tp)
 602{
 603        return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 604}
 605
 606static void skb_entail(struct sock *sk, struct sk_buff *skb)
 607{
 608        struct tcp_sock *tp = tcp_sk(sk);
 609        struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 610
 611        skb->csum    = 0;
 612        tcb->seq     = tcb->end_seq = tp->write_seq;
 613        tcb->tcp_flags = TCPHDR_ACK;
 614        tcb->sacked  = 0;
 615        __skb_header_release(skb);
 616        tcp_add_write_queue_tail(sk, skb);
 617        sk->sk_wmem_queued += skb->truesize;
 618        sk_mem_charge(sk, skb->truesize);
 619        if (tp->nonagle & TCP_NAGLE_PUSH)
 620                tp->nonagle &= ~TCP_NAGLE_PUSH;
 621
 622        tcp_slow_start_after_idle_check(sk);
 623}
 624
 625static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
 626{
 627        if (flags & MSG_OOB)
 628                tp->snd_up = tp->write_seq;
 629}
 630
 631/* If a not yet filled skb is pushed, do not send it if
 632 * we have data packets in Qdisc or NIC queues :
 633 * Because TX completion will happen shortly, it gives a chance
 634 * to coalesce future sendmsg() payload into this skb, without
 635 * need for a timer, and with no latency trade off.
 636 * As packets containing data payload have a bigger truesize
 637 * than pure acks (dataless) packets, the last checks prevent
 638 * autocorking if we only have an ACK in Qdisc/NIC queues,
 639 * or if TX completion was delayed after we processed ACK packet.
 640 */
 641static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
 642                                int size_goal)
 643{
 644        return skb->len < size_goal &&
 645               sysctl_tcp_autocorking &&
 646               skb != tcp_write_queue_head(sk) &&
 647               atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
 648}
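/* [Editorial example, not part of the kernel source] The sysctl_tcp_autocorking
 * knob tested above is exposed as /proc/sys/net/ipv4/tcp_autocorking. A minimal
 * check of whether autocorking is currently enabled on this system:
 */
#include <stdio.h>

static int tcp_autocorking_enabled(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_autocorking", "r");
        int val = -1;   /* -1: unknown (file missing or unreadable) */

        if (!f)
                return -1;
        if (fscanf(f, "%d", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}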
 649
 650static void tcp_push(struct sock *sk, int flags, int mss_now,
 651                     int nonagle, int size_goal)
 652{
 653        struct tcp_sock *tp = tcp_sk(sk);
 654        struct sk_buff *skb;
 655
 656        if (!tcp_send_head(sk))
 657                return;
 658
 659        skb = tcp_write_queue_tail(sk);
 660        if (!(flags & MSG_MORE) || forced_push(tp))
 661                tcp_mark_push(tp, skb);
 662
 663        tcp_mark_urg(tp, flags);
 664
 665        if (tcp_should_autocork(sk, skb, size_goal)) {
 666
 667                /* avoid atomic op if TSQ_THROTTLED bit is already set */
 668                if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
 669                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
 670                        set_bit(TSQ_THROTTLED, &tp->tsq_flags);
 671                }
 672                /* It is possible TX completion already happened
 673                 * before we set TSQ_THROTTLED.
 674                 */
 675                if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
 676                        return;
 677        }
 678
 679        if (flags & MSG_MORE)
 680                nonagle = TCP_NAGLE_CORK;
 681
 682        __tcp_push_pending_frames(sk, mss_now, nonagle);
 683}
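/* [Editorial example, not part of the kernel source] The MSG_MORE handling in
 * tcp_push() is what lets an application batch a header and a payload into as
 * few segments as possible; the TCP_CORK socket option achieves a similar
 * effect across calls that cannot pass flags. A sketch assuming a connected
 * socket "fd" and caller-provided buffers:
 */
#include <stddef.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>        /* TCP_CORK */

static int send_header_then_body(int fd, const void *hdr, size_t hdr_len,
                                 const void *body, size_t body_len)
{
        /* MSG_MORE: more data follows, don't push a small segment yet */
        if (send(fd, hdr, hdr_len, MSG_MORE) < 0)
                return -1;
        if (send(fd, body, body_len, 0) < 0)
                return -1;
        return 0;
}

static void cork(int fd, int on)
{
        /* Equivalent knob when flags can't be passed (e.g. around sendfile()) */
        setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
}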
 684
 685static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
 686                                unsigned int offset, size_t len)
 687{
 688        struct tcp_splice_state *tss = rd_desc->arg.data;
 689        int ret;
 690
 691        ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
 692                              min(rd_desc->count, len), tss->flags);
 693        if (ret > 0)
 694                rd_desc->count -= ret;
 695        return ret;
 696}
 697
 698static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
 699{
 700        /* Store TCP splice context information in read_descriptor_t. */
 701        read_descriptor_t rd_desc = {
 702                .arg.data = tss,
 703                .count    = tss->len,
 704        };
 705
 706        return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
 707}
 708
 709/**
 710 *  tcp_splice_read - splice data from TCP socket to a pipe
 711 * @sock:       socket to splice from
 712 * @ppos:       position (not valid)
 713 * @pipe:       pipe to splice to
 714 * @len:        number of bytes to splice
 715 * @flags:      splice modifier flags
 716 *
 717 * Description:
 718 *    Will read pages from given socket and fill them into a pipe.
 719 *
 720 **/
 721ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 722                        struct pipe_inode_info *pipe, size_t len,
 723                        unsigned int flags)
 724{
 725        struct sock *sk = sock->sk;
 726        struct tcp_splice_state tss = {
 727                .pipe = pipe,
 728                .len = len,
 729                .flags = flags,
 730        };
 731        long timeo;
 732        ssize_t spliced;
 733        int ret;
 734
 735        sock_rps_record_flow(sk);
 736        /*
 737         * We can't seek on a socket input
 738         */
 739        if (unlikely(*ppos))
 740                return -ESPIPE;
 741
 742        ret = spliced = 0;
 743
 744        lock_sock(sk);
 745
 746        timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
 747        while (tss.len) {
 748                ret = __tcp_splice_read(sk, &tss);
 749                if (ret < 0)
 750                        break;
 751                else if (!ret) {
 752                        if (spliced)
 753                                break;
 754                        if (sock_flag(sk, SOCK_DONE))
 755                                break;
 756                        if (sk->sk_err) {
 757                                ret = sock_error(sk);
 758                                break;
 759                        }
 760                        if (sk->sk_shutdown & RCV_SHUTDOWN)
 761                                break;
 762                        if (sk->sk_state == TCP_CLOSE) {
 763                                /*
 764                                 * This occurs when user tries to read
  765                                 * from a never connected socket.
 766                                 */
 767                                if (!sock_flag(sk, SOCK_DONE))
 768                                        ret = -ENOTCONN;
 769                                break;
 770                        }
 771                        if (!timeo) {
 772                                ret = -EAGAIN;
 773                                break;
 774                        }
 775                        sk_wait_data(sk, &timeo, NULL);
 776                        if (signal_pending(current)) {
 777                                ret = sock_intr_errno(timeo);
 778                                break;
 779                        }
 780                        continue;
 781                }
 782                tss.len -= ret;
 783                spliced += ret;
 784
 785                if (!timeo)
 786                        break;
 787                release_sock(sk);
 788                lock_sock(sk);
 789
 790                if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
 791                    (sk->sk_shutdown & RCV_SHUTDOWN) ||
 792                    signal_pending(current))
 793                        break;
 794        }
 795
 796        release_sock(sk);
 797
 798        if (spliced)
 799                return spliced;
 800
 801        return ret;
 802}
 803EXPORT_SYMBOL(tcp_splice_read);
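/* [Editorial example, not part of the kernel source] The receive-side path
 * exercised by tcp_splice_read(): moving bytes from a connected TCP socket
 * "sockfd" into a file "filefd" through a pipe without copying them through
 * userspace buffers. A minimal sketch; partial pipe drains and EINTR handling
 * are omitted:
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static long splice_socket_to_file(int sockfd, int filefd, size_t len)
{
        int pipefd[2];
        long total = 0;

        if (pipe(pipefd) < 0)
                return -1;

        while (len > 0) {
                /* socket -> pipe (this is the tcp_splice_read() side) */
                ssize_t n = splice(sockfd, NULL, pipefd[1], NULL, len,
                                   SPLICE_F_MOVE);
                if (n <= 0)
                        break;
                /* pipe -> file */
                if (splice(pipefd[0], NULL, filefd, NULL, n, SPLICE_F_MOVE) < 0)
                        break;
                total += n;
                len -= n;
        }
        close(pipefd[0]);
        close(pipefd[1]);
        return total;
}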
 804
 805struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 806                                    bool force_schedule)
 807{
 808        struct sk_buff *skb;
 809
 810        /* The TCP header must be at least 32-bit aligned.  */
 811        size = ALIGN(size, 4);
 812
 813        if (unlikely(tcp_under_memory_pressure(sk)))
 814                sk_mem_reclaim_partial(sk);
 815
 816        skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
 817        if (likely(skb)) {
 818                bool mem_scheduled;
 819
 820                if (force_schedule) {
 821                        mem_scheduled = true;
 822                        sk_forced_mem_schedule(sk, skb->truesize);
 823                } else {
 824                        mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
 825                }
 826                if (likely(mem_scheduled)) {
 827                        skb_reserve(skb, sk->sk_prot->max_header);
 828                        /*
 829                         * Make sure that we have exactly size bytes
 830                         * available to the caller, no more, no less.
 831                         */
 832                        skb->reserved_tailroom = skb->end - skb->tail - size;
 833                        return skb;
 834                }
 835                __kfree_skb(skb);
 836        } else {
 837                sk->sk_prot->enter_memory_pressure(sk);
 838                sk_stream_moderate_sndbuf(sk);
 839        }
 840        return NULL;
 841}
 842
 843static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 844                                       int large_allowed)
 845{
 846        struct tcp_sock *tp = tcp_sk(sk);
 847        u32 new_size_goal, size_goal;
 848
 849        if (!large_allowed || !sk_can_gso(sk))
 850                return mss_now;
 851
 852        /* Note : tcp_tso_autosize() will eventually split this later */
 853        new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
 854        new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
 855
 856        /* We try hard to avoid divides here */
 857        size_goal = tp->gso_segs * mss_now;
 858        if (unlikely(new_size_goal < size_goal ||
 859                     new_size_goal >= size_goal + mss_now)) {
 860                tp->gso_segs = min_t(u16, new_size_goal / mss_now,
 861                                     sk->sk_gso_max_segs);
 862                size_goal = tp->gso_segs * mss_now;
 863        }
 864
 865        return max(size_goal, mss_now);
 866}
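/* [Editorial example, not part of the kernel source] The rounding above keeps
 * the transmit size goal a whole multiple of the current MSS so that a TSO/GSO
 * super-packet splits cleanly into full segments. A standalone sketch of the
 * same arithmetic with arbitrary example values (the window clamp applied by
 * tcp_bound_to_half_wnd() is ignored here):
 */
#include <stdio.h>

int main(void)
{
        unsigned int mss = 1448;                  /* e.g. 1500-byte MTU with TCP timestamps */
        unsigned int raw_goal = 65536 - 1 - 320;  /* gso_max_size minus a header budget (example) */
        unsigned int segs = raw_goal / mss;       /* whole MSS-sized segments only */
        unsigned int size_goal = segs ? segs * mss : mss;

        printf("%u segments -> size_goal = %u bytes\n", segs, size_goal);
        return 0;
}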
 867
 868static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 869{
 870        int mss_now;
 871
 872        mss_now = tcp_current_mss(sk);
 873        *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
 874
 875        return mss_now;
 876}
 877
 878static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 879                                size_t size, int flags)
 880{
 881        struct tcp_sock *tp = tcp_sk(sk);
 882        int mss_now, size_goal;
 883        int err;
 884        ssize_t copied;
 885        long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 886
 887        /* Wait for a connection to finish. One exception is TCP Fast Open
 888         * (passive side) where data is allowed to be sent before a connection
 889         * is fully established.
 890         */
 891        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
 892            !tcp_passive_fastopen(sk)) {
 893                err = sk_stream_wait_connect(sk, &timeo);
 894                if (err != 0)
 895                        goto out_err;
 896        }
 897
 898        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 899
 900        mss_now = tcp_send_mss(sk, &size_goal, flags);
 901        copied = 0;
 902
 903        err = -EPIPE;
 904        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 905                goto out_err;
 906
 907        while (size > 0) {
 908                struct sk_buff *skb = tcp_write_queue_tail(sk);
 909                int copy, i;
 910                bool can_coalesce;
 911
 912                if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
 913                    !tcp_skb_can_collapse_to(skb)) {
 914new_segment:
 915                        if (!sk_stream_memory_free(sk))
 916                                goto wait_for_sndbuf;
 917
 918                        skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 919                                                  skb_queue_empty(&sk->sk_write_queue));
 920                        if (!skb)
 921                                goto wait_for_memory;
 922
 923                        skb_entail(sk, skb);
 924                        copy = size_goal;
 925                }
 926
 927                if (copy > size)
 928                        copy = size;
 929
 930                i = skb_shinfo(skb)->nr_frags;
 931                can_coalesce = skb_can_coalesce(skb, i, page, offset);
 932                if (!can_coalesce && i >= sysctl_max_skb_frags) {
 933                        tcp_mark_push(tp, skb);
 934                        goto new_segment;
 935                }
 936                if (!sk_wmem_schedule(sk, copy))
 937                        goto wait_for_memory;
 938
 939                if (can_coalesce) {
 940                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 941                } else {
 942                        get_page(page);
 943                        skb_fill_page_desc(skb, i, page, offset, copy);
 944                }
 945                skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
 946
 947                skb->len += copy;
 948                skb->data_len += copy;
 949                skb->truesize += copy;
 950                sk->sk_wmem_queued += copy;
 951                sk_mem_charge(sk, copy);
 952                skb->ip_summed = CHECKSUM_PARTIAL;
 953                tp->write_seq += copy;
 954                TCP_SKB_CB(skb)->end_seq += copy;
 955                tcp_skb_pcount_set(skb, 0);
 956
 957                if (!copied)
 958                        TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
 959
 960                copied += copy;
 961                offset += copy;
 962                size -= copy;
 963                if (!size) {
 964                        tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
 965                        goto out;
 966                }
 967
 968                if (skb->len < size_goal || (flags & MSG_OOB))
 969                        continue;
 970
 971                if (forced_push(tp)) {
 972                        tcp_mark_push(tp, skb);
 973                        __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
 974                } else if (skb == tcp_send_head(sk))
 975                        tcp_push_one(sk, mss_now);
 976                continue;
 977
 978wait_for_sndbuf:
 979                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 980wait_for_memory:
 981                tcp_push(sk, flags & ~MSG_MORE, mss_now,
 982                         TCP_NAGLE_PUSH, size_goal);
 983
 984                err = sk_stream_wait_memory(sk, &timeo);
 985                if (err != 0)
 986                        goto do_error;
 987
 988                mss_now = tcp_send_mss(sk, &size_goal, flags);
 989        }
 990
 991out:
 992        if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
 993                tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 994        return copied;
 995
 996do_error:
 997        if (copied)
 998                goto out;
 999out_err:
1000        /* make sure we wake any epoll edge trigger waiter */
1001        if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
1002                sk->sk_write_space(sk);
1003        return sk_stream_error(sk, flags, err);
1004}
1005
1006int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1007                 size_t size, int flags)
1008{
1009        ssize_t res;
1010
1011        if (!(sk->sk_route_caps & NETIF_F_SG) ||
1012            !sk_check_csum_caps(sk))
1013                return sock_no_sendpage(sk->sk_socket, page, offset, size,
1014                                        flags);
1015
1016        lock_sock(sk);
1017
1018        tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
1019
1020        res = do_tcp_sendpages(sk, page, offset, size, flags);
1021        release_sock(sk);
1022        return res;
1023}
1024EXPORT_SYMBOL(tcp_sendpage);
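/* [Editorial example, not part of the kernel source] tcp_sendpage() is the
 * path taken by zero-copy transmit interfaces such as sendfile(2). A minimal
 * sketch, assuming a connected TCP socket "sockfd" and an open regular file
 * "filefd":
 */
#include <sys/sendfile.h>
#include <unistd.h>

static long send_file_over_tcp(int sockfd, int filefd, size_t count)
{
        off_t off = 0;
        long total = 0;

        while ((size_t)total < count) {
                ssize_t n = sendfile(sockfd, filefd, &off, count - total);
                if (n < 0)
                        return -1;      /* caller may retry on EAGAIN/EINTR */
                if (n == 0)
                        break;          /* EOF on the input file */
                total += n;
        }
        return total;
}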
1025
1026/* Do not bother using a page frag for very small frames.
1027 * But use this heuristic only for the first skb in write queue.
1028 *
1029 * Having no payload in skb->head allows better SACK shifting
1030 * in tcp_shift_skb_data(), reducing sack/rack overhead, because
 1031 *              write queue has fewer skbs.
1032 * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
 1033 *              This also speeds up tso_fragment(), since it won't fall back
1034 * to tcp_fragment().
1035 */
1036static int linear_payload_sz(bool first_skb)
1037{
1038        if (first_skb)
1039                return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
1040        return 0;
1041}
1042
1043static int select_size(const struct sock *sk, bool sg, bool first_skb)
1044{
1045        const struct tcp_sock *tp = tcp_sk(sk);
1046        int tmp = tp->mss_cache;
1047
1048        if (sg) {
1049                if (sk_can_gso(sk)) {
1050                        tmp = linear_payload_sz(first_skb);
1051                } else {
1052                        int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1053
1054                        if (tmp >= pgbreak &&
1055                            tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1056                                tmp = pgbreak;
1057                }
1058        }
1059
1060        return tmp;
1061}
1062
1063void tcp_free_fastopen_req(struct tcp_sock *tp)
1064{
1065        if (tp->fastopen_req) {
1066                kfree(tp->fastopen_req);
1067                tp->fastopen_req = NULL;
1068        }
1069}
1070
1071static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1072                                int *copied, size_t size)
1073{
1074        struct tcp_sock *tp = tcp_sk(sk);
1075        int err, flags;
1076
1077        if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1078                return -EOPNOTSUPP;
1079        if (tp->fastopen_req)
1080                return -EALREADY; /* Another Fast Open is in progress */
1081
1082        tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1083                                   sk->sk_allocation);
1084        if (unlikely(!tp->fastopen_req))
1085                return -ENOBUFS;
1086        tp->fastopen_req->data = msg;
1087        tp->fastopen_req->size = size;
1088
1089        flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1090        err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1091                                    msg->msg_namelen, flags);
1092        *copied = tp->fastopen_req->copied;
1093        tcp_free_fastopen_req(tp);
1094        return err;
1095}
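/* [Editorial example, not part of the kernel source] The client side of the
 * MSG_FASTOPEN path handled above, sending data with the SYN. Assumes the
 * client bit of net.ipv4.tcp_fastopen is set and that "addr"/"addrlen"
 * describe the server; MSG_FASTOPEN comes from <sys/socket.h> on reasonably
 * recent glibc.
 */
#include <stddef.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

static int fastopen_connect_send(const struct sockaddr *addr, socklen_t addrlen,
                                 const void *buf, size_t len)
{
        int fd = socket(addr->sa_family, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        /* connect() and the first data segment in one call: the SYN carries
         * the data when a Fast Open cookie is already cached; otherwise the
         * kernel requests a cookie and sends the data after the handshake.
         */
        if (sendto(fd, buf, len, MSG_FASTOPEN, addr, addrlen) < 0) {
                close(fd);
                return -1;
        }
        return fd;      /* caller reads the reply and closes the socket */
}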
1096
1097int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1098{
1099        struct tcp_sock *tp = tcp_sk(sk);
1100        struct sk_buff *skb;
1101        struct sockcm_cookie sockc;
1102        int flags, err, copied = 0;
1103        int mss_now = 0, size_goal, copied_syn = 0;
1104        bool process_backlog = false;
1105        bool sg;
1106        long timeo;
1107
1108        lock_sock(sk);
1109
1110        flags = msg->msg_flags;
1111        if (flags & MSG_FASTOPEN) {
1112                err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
1113                if (err == -EINPROGRESS && copied_syn > 0)
1114                        goto out;
1115                else if (err)
1116                        goto out_err;
1117        }
1118
1119        timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1120
1121        tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
1122
1123        /* Wait for a connection to finish. One exception is TCP Fast Open
1124         * (passive side) where data is allowed to be sent before a connection
1125         * is fully established.
1126         */
1127        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1128            !tcp_passive_fastopen(sk)) {
1129                err = sk_stream_wait_connect(sk, &timeo);
1130                if (err != 0)
1131                        goto do_error;
1132        }
1133
1134        if (unlikely(tp->repair)) {
1135                if (tp->repair_queue == TCP_RECV_QUEUE) {
1136                        copied = tcp_send_rcvq(sk, msg, size);
1137                        goto out_nopush;
1138                }
1139
1140                err = -EINVAL;
1141                if (tp->repair_queue == TCP_NO_QUEUE)
1142                        goto out_err;
1143
1144                /* 'common' sending to sendq */
1145        }
1146
1147        sockc.tsflags = sk->sk_tsflags;
1148        if (msg->msg_controllen) {
1149                err = sock_cmsg_send(sk, msg, &sockc);
1150                if (unlikely(err)) {
1151                        err = -EINVAL;
1152                        goto out_err;
1153                }
1154        }
1155
1156        /* This should be in poll */
1157        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1158
1159        /* Ok commence sending. */
1160        copied = 0;
1161
1162restart:
1163        mss_now = tcp_send_mss(sk, &size_goal, flags);
1164
1165        err = -EPIPE;
1166        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1167                goto do_error;
1168
1169        sg = !!(sk->sk_route_caps & NETIF_F_SG);
1170
1171        while (msg_data_left(msg)) {
1172                int copy = 0;
1173                int max = size_goal;
1174
1175                skb = tcp_write_queue_tail(sk);
1176                if (tcp_send_head(sk)) {
1177                        if (skb->ip_summed == CHECKSUM_NONE)
1178                                max = mss_now;
1179                        copy = max - skb->len;
1180                }
1181
1182                if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1183                        bool first_skb;
1184
1185new_segment:
1186                        /* Allocate new segment. If the interface is SG,
1187                         * allocate skb fitting to single page.
1188                         */
1189                        if (!sk_stream_memory_free(sk))
1190                                goto wait_for_sndbuf;
1191
1192                        if (process_backlog && sk_flush_backlog(sk)) {
1193                                process_backlog = false;
1194                                goto restart;
1195                        }
1196                        first_skb = skb_queue_empty(&sk->sk_write_queue);
1197                        skb = sk_stream_alloc_skb(sk,
1198                                                  select_size(sk, sg, first_skb),
1199                                                  sk->sk_allocation,
1200                                                  first_skb);
1201                        if (!skb)
1202                                goto wait_for_memory;
1203
1204                        process_backlog = true;
1205                        /*
1206                         * Check whether we can use HW checksum.
1207                         */
1208                        if (sk_check_csum_caps(sk))
1209                                skb->ip_summed = CHECKSUM_PARTIAL;
1210
1211                        skb_entail(sk, skb);
1212                        copy = size_goal;
1213                        max = size_goal;
1214
1215                        /* All packets are restored as if they have
1216                         * already been sent. skb_mstamp isn't set to
1217                         * avoid wrong rtt estimation.
1218                         */
1219                        if (tp->repair)
1220                                TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1221                }
1222
1223                /* Try to append data to the end of skb. */
1224                if (copy > msg_data_left(msg))
1225                        copy = msg_data_left(msg);
1226
1227                /* Where to copy to? */
1228                if (skb_availroom(skb) > 0) {
1229                        /* We have some space in skb head. Superb! */
1230                        copy = min_t(int, copy, skb_availroom(skb));
1231                        err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1232                        if (err)
1233                                goto do_fault;
1234                } else {
1235                        bool merge = true;
1236                        int i = skb_shinfo(skb)->nr_frags;
1237                        struct page_frag *pfrag = sk_page_frag(sk);
1238
1239                        if (!sk_page_frag_refill(sk, pfrag))
1240                                goto wait_for_memory;
1241
1242                        if (!skb_can_coalesce(skb, i, pfrag->page,
1243                                              pfrag->offset)) {
1244                                if (i >= sysctl_max_skb_frags || !sg) {
1245                                        tcp_mark_push(tp, skb);
1246                                        goto new_segment;
1247                                }
1248                                merge = false;
1249                        }
1250
1251                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
1252
1253                        if (!sk_wmem_schedule(sk, copy))
1254                                goto wait_for_memory;
1255
1256                        err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1257                                                       pfrag->page,
1258                                                       pfrag->offset,
1259                                                       copy);
1260                        if (err)
1261                                goto do_error;
1262
1263                        /* Update the skb. */
1264                        if (merge) {
1265                                skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1266                        } else {
1267                                skb_fill_page_desc(skb, i, pfrag->page,
1268                                                   pfrag->offset, copy);
1269                                get_page(pfrag->page);
1270                        }
1271                        pfrag->offset += copy;
1272                }
1273
1274                if (!copied)
1275                        TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1276
1277                tp->write_seq += copy;
1278                TCP_SKB_CB(skb)->end_seq += copy;
1279                tcp_skb_pcount_set(skb, 0);
1280
1281                copied += copy;
1282                if (!msg_data_left(msg)) {
1283                        tcp_tx_timestamp(sk, sockc.tsflags, skb);
1284                        if (unlikely(flags & MSG_EOR))
1285                                TCP_SKB_CB(skb)->eor = 1;
1286                        goto out;
1287                }
1288
1289                if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1290                        continue;
1291
1292                if (forced_push(tp)) {
1293                        tcp_mark_push(tp, skb);
1294                        __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1295                } else if (skb == tcp_send_head(sk))
1296                        tcp_push_one(sk, mss_now);
1297                continue;
1298
1299wait_for_sndbuf:
1300                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1301wait_for_memory:
1302                if (copied)
1303                        tcp_push(sk, flags & ~MSG_MORE, mss_now,
1304                                 TCP_NAGLE_PUSH, size_goal);
1305
1306                err = sk_stream_wait_memory(sk, &timeo);
1307                if (err != 0)
1308                        goto do_error;
1309
1310                mss_now = tcp_send_mss(sk, &size_goal, flags);
1311        }
1312
1313out:
1314        if (copied)
1315                tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1316out_nopush:
1317        release_sock(sk);
1318        return copied + copied_syn;
1319
1320do_fault:
1321        if (!skb->len) {
1322                tcp_unlink_write_queue(skb, sk);
1323                /* It is the one place in all of TCP, except connection
1324                 * reset, where we can be unlinking the send_head.
1325                 */
1326                tcp_check_send_head(sk, skb);
1327                sk_wmem_free_skb(sk, skb);
1328        }
1329
1330do_error:
1331        if (copied + copied_syn)
1332                goto out;
1333out_err:
1334        err = sk_stream_error(sk, flags, err);
1335        /* make sure we wake any epoll edge trigger waiter */
1336        if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
1337                sk->sk_write_space(sk);
1338        release_sock(sk);
1339        return err;
1340}
1341EXPORT_SYMBOL(tcp_sendmsg);
1342
1343/*
1344 *      Handle reading urgent data. BSD has very simple semantics for
1345 *      this, no blocking and very strange errors 8)
1346 */
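/*
 *	This is what ends up servicing a userspace recv(fd, &c, 1, MSG_OOB)
 *	when urgent data is kept out of band.  Illustrative call, not part
 *	of this file:
 *
 *		char c;
 *		ssize_t n = recv(fd, &c, 1, MSG_OOB);
 *
 *	With SO_OOBINLINE set, or with no urgent data pending, the checks
 *	below make this fail with EINVAL instead.
 */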
1347
1348static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1349{
1350        struct tcp_sock *tp = tcp_sk(sk);
1351
1352        /* No URG data to read. */
1353        if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1354            tp->urg_data == TCP_URG_READ)
1355                return -EINVAL; /* Yes this is right ! */
1356
1357        if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1358                return -ENOTCONN;
1359
1360        if (tp->urg_data & TCP_URG_VALID) {
1361                int err = 0;
1362                char c = tp->urg_data;
1363
1364                if (!(flags & MSG_PEEK))
1365                        tp->urg_data = TCP_URG_READ;
1366
1367                /* Read urgent data. */
1368                msg->msg_flags |= MSG_OOB;
1369
1370                if (len > 0) {
1371                        if (!(flags & MSG_TRUNC))
1372                                err = memcpy_to_msg(msg, &c, 1);
1373                        len = 1;
1374                } else
1375                        msg->msg_flags |= MSG_TRUNC;
1376
1377                return err ? -EFAULT : len;
1378        }
1379
1380        if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1381                return 0;
1382
1383        /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1384         * the available implementations agree in this case:
1385         * this call should never block, independent of the
1386         * blocking state of the socket.
1387         * Mike <pall@rz.uni-karlsruhe.de>
1388         */
1389        return -EAGAIN;
1390}
1391
1392static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1393{
1394        struct sk_buff *skb;
1395        int copied = 0, err = 0;
1396
1397        /* XXX -- need to support SO_PEEK_OFF */
1398
1399        skb_queue_walk(&sk->sk_write_queue, skb) {
1400                err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1401                if (err)
1402                        break;
1403
1404                copied += skb->len;
1405        }
1406
1407        return err ?: copied;
1408}
1409
1410/* Clean up the receive buffer for full frames taken by the user,
1411 * then send an ACK if necessary.  COPIED is the number of bytes
1412 * tcp_recvmsg has given to the user so far; it speeds up the
1413 * calculation of whether or not we must ACK for the sake of
1414 * a window update.
1415 */
1416static void tcp_cleanup_rbuf(struct sock *sk, int copied)
1417{
1418        struct tcp_sock *tp = tcp_sk(sk);
1419        bool time_to_ack = false;
1420
1421        struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1422
1423        WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1424             "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1425             tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1426
1427        if (inet_csk_ack_scheduled(sk)) {
1428                const struct inet_connection_sock *icsk = inet_csk(sk);
1429                /* Delayed ACKs frequently hit locked sockets during bulk
1430                 * receive. */
1431                if (icsk->icsk_ack.blocked ||
1432                    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1433                    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1434                    /*
1435                     * If this read emptied the read buffer, we send an ACK
1436                     * when the connection is not bidirectional, the user
1437                     * drained the receive buffer, and there was a small
1438                     * segment left in the queue.
1439                     */
1440                    (copied > 0 &&
1441                     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1442                      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1443                       !icsk->icsk_ack.pingpong)) &&
1444                      !atomic_read(&sk->sk_rmem_alloc)))
1445                        time_to_ack = true;
1446        }
1447
1448        /* We send an ACK if we can now advertise a non-zero window
1449         * which has been raised "significantly".
1450         *
1451         * Even if the window was raised up to infinity, do not send a
1452         * window-open ACK in states where we will not receive more. It is useless.
1453         */
1454        if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1455                __u32 rcv_window_now = tcp_receive_window(tp);
1456
1457                /* Optimize, __tcp_select_window() is not cheap. */
1458                if (2*rcv_window_now <= tp->window_clamp) {
1459                        __u32 new_window = __tcp_select_window(sk);
1460
1461                        /* Send an ACK now if this read freed lots of space
1462                         * in our buffer. We can advertise the new window now,
1463                         * provided it is not smaller than the current one.
1464                         * "Lots" means "at least twice" here.
1465                         */
1466                        if (new_window && new_window >= 2 * rcv_window_now)
1467                                time_to_ack = true;
1468                }
1469        }
1470        if (time_to_ack)
1471                tcp_send_ack(sk);
1472}
1473
1474static void tcp_prequeue_process(struct sock *sk)
1475{
1476        struct sk_buff *skb;
1477        struct tcp_sock *tp = tcp_sk(sk);
1478
1479        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1480
1481        while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1482                sk_backlog_rcv(sk, skb);
1483
1484        /* Clear memory counter. */
1485        tp->ucopy.memory = 0;
1486}
1487
1488static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1489{
1490        struct sk_buff *skb;
1491        u32 offset;
1492
1493        while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1494                offset = seq - TCP_SKB_CB(skb)->seq;
1495                if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1496                        pr_err_once("%s: found a SYN, please report !\n", __func__);
1497                        offset--;
1498                }
1499                if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1500                        *off = offset;
1501                        return skb;
1502                }
1503                /* This looks weird, but it can happen if TCP collapsing
1504                 * split a fat GRO packet while we released the socket lock
1505                 * in skb_splice_bits().
1506                 */
1507                sk_eat_skb(sk, skb);
1508        }
1509        return NULL;
1510}
1511
1512/*
1513 * This routine provides an alternative to tcp_recvmsg() for routines
1514 * that would like to handle copying from skbuffs directly in 'sendfile'
1515 * fashion.
1516 * Note:
1517 *      - It is assumed that the socket was locked by the caller.
1518 *      - The routine does not block.
1519 *      - At present, there is no support for reading OOB data
1520 *        or for 'peeking' the socket using this routine
1521 *        (although both would be easy to implement).
1522 */
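/*
 *	The recv_actor callback has the sk_read_actor_t signature: it is
 *	handed an skb plus an offset and length, and returns how many bytes
 *	it consumed (0 or a negative value stops the walk).  A minimal
 *	sketch of such an actor (illustrative only, the name is made up):
 *
 *		static int count_actor(read_descriptor_t *desc,
 *				       struct sk_buff *skb,
 *				       unsigned int offset, size_t len)
 *		{
 *			desc->written += len;
 *			return len;
 *		}
 *
 *	tcp_splice_read() is one in-tree user of this interface.
 */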
1523int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1524                  sk_read_actor_t recv_actor)
1525{
1526        struct sk_buff *skb;
1527        struct tcp_sock *tp = tcp_sk(sk);
1528        u32 seq = tp->copied_seq;
1529        u32 offset;
1530        int copied = 0;
1531
1532        if (sk->sk_state == TCP_LISTEN)
1533                return -ENOTCONN;
1534        while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1535                if (offset < skb->len) {
1536                        int used;
1537                        size_t len;
1538
1539                        len = skb->len - offset;
1540                        /* Stop reading if we hit a patch of urgent data */
1541                        if (tp->urg_data) {
1542                                u32 urg_offset = tp->urg_seq - seq;
1543                                if (urg_offset < len)
1544                                        len = urg_offset;
1545                                if (!len)
1546                                        break;
1547                        }
1548                        used = recv_actor(desc, skb, offset, len);
1549                        if (used <= 0) {
1550                                if (!copied)
1551                                        copied = used;
1552                                break;
1553                        } else if (used <= len) {
1554                                seq += used;
1555                                copied += used;
1556                                offset += used;
1557                        }
1558                        /* If recv_actor drops the lock (e.g. TCP splice
1559                         * receive) the skb pointer might be invalid when
1560                         * getting here: tcp_collapse might have deleted it
1561                         * while aggregating skbs from the socket queue.
1562                         */
1563                        skb = tcp_recv_skb(sk, seq - 1, &offset);
1564                        if (!skb)
1565                                break;
1566                        /* TCP coalescing might have appended data to the skb.
1567                         * Try to splice more frags
1568                         */
1569                        if (offset + 1 != skb->len)
1570                                continue;
1571                }
1572                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1573                        sk_eat_skb(sk, skb);
1574                        ++seq;
1575                        break;
1576                }
1577                sk_eat_skb(sk, skb);
1578                if (!desc->count)
1579                        break;
1580                tp->copied_seq = seq;
1581        }
1582        tp->copied_seq = seq;
1583
1584        tcp_rcv_space_adjust(sk);
1585
1586        /* Clean up data we have read: This will do ACK frames. */
1587        if (copied > 0) {
1588                tcp_recv_skb(sk, seq, &offset);
1589                tcp_cleanup_rbuf(sk, copied);
1590        }
1591        return copied;
1592}
1593EXPORT_SYMBOL(tcp_read_sock);
1594
1595int tcp_peek_len(struct socket *sock)
1596{
1597        return tcp_inq(sock->sk);
1598}
1599EXPORT_SYMBOL(tcp_peek_len);
1600
1601/*
1602 *      This routine copies from a sock struct into the user buffer.
1603 *
1604 *      Technical note: in 2.3 we work on a _locked_ socket, so that
1605 *      tricks with *seq access order and skb->users are not required.
1606 *      Probably, the code can easily be improved even more.
1607 */
1608
1609int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1610                int flags, int *addr_len)
1611{
1612        struct tcp_sock *tp = tcp_sk(sk);
1613        int copied = 0;
1614        u32 peek_seq;
1615        u32 *seq;
1616        unsigned long used;
1617        int err;
1618        int target;             /* Read at least this many bytes */
1619        long timeo;
1620        struct task_struct *user_recv = NULL;
1621        struct sk_buff *skb, *last;
1622        u32 urg_hole = 0;
1623
1624        if (unlikely(flags & MSG_ERRQUEUE))
1625                return inet_recv_error(sk, msg, len, addr_len);
1626
1627        if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
1628            (sk->sk_state == TCP_ESTABLISHED))
1629                sk_busy_loop(sk, nonblock);
1630
1631        lock_sock(sk);
1632
1633        err = -ENOTCONN;
1634        if (sk->sk_state == TCP_LISTEN)
1635                goto out;
1636
1637        timeo = sock_rcvtimeo(sk, nonblock);
1638
1639        /* Urgent data needs to be handled specially. */
1640        if (flags & MSG_OOB)
1641                goto recv_urg;
1642
1643        if (unlikely(tp->repair)) {
1644                err = -EPERM;
1645                if (!(flags & MSG_PEEK))
1646                        goto out;
1647
1648                if (tp->repair_queue == TCP_SEND_QUEUE)
1649                        goto recv_sndq;
1650
1651                err = -EINVAL;
1652                if (tp->repair_queue == TCP_NO_QUEUE)
1653                        goto out;
1654
1655                /* 'common' recv queue MSG_PEEK-ing */
1656        }
1657
1658        seq = &tp->copied_seq;
1659        if (flags & MSG_PEEK) {
1660                peek_seq = tp->copied_seq;
1661                seq = &peek_seq;
1662        }
1663
1664        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1665
1666        do {
1667                u32 offset;
1668
1669                /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1670                if (tp->urg_data && tp->urg_seq == *seq) {
1671                        if (copied)
1672                                break;
1673                        if (signal_pending(current)) {
1674                                copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1675                                break;
1676                        }
1677                }
1678
1679                /* Next get a buffer. */
1680
1681                last = skb_peek_tail(&sk->sk_receive_queue);
1682                skb_queue_walk(&sk->sk_receive_queue, skb) {
1683                        last = skb;
1684                        /* Now that we have two receive queues this
1685                         * shouldn't happen.
1686                         */
1687                        if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1688                                 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1689                                 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1690                                 flags))
1691                                break;
1692
1693                        offset = *seq - TCP_SKB_CB(skb)->seq;
1694                        if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1695                                pr_err_once("%s: found a SYN, please report !\n", __func__);
1696                                offset--;
1697                        }
1698                        if (offset < skb->len)
1699                                goto found_ok_skb;
1700                        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1701                                goto found_fin_ok;
1702                        WARN(!(flags & MSG_PEEK),
1703                             "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1704                             *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1705                }
1706
1707                /* Well, if we have backlog, try to process it now. */
1708
1709                if (copied >= target && !sk->sk_backlog.tail)
1710                        break;
1711
1712                if (copied) {
1713                        if (sk->sk_err ||
1714                            sk->sk_state == TCP_CLOSE ||
1715                            (sk->sk_shutdown & RCV_SHUTDOWN) ||
1716                            !timeo ||
1717                            signal_pending(current))
1718                                break;
1719                } else {
1720                        if (sock_flag(sk, SOCK_DONE))
1721                                break;
1722
1723                        if (sk->sk_err) {
1724                                copied = sock_error(sk);
1725                                break;
1726                        }
1727
1728                        if (sk->sk_shutdown & RCV_SHUTDOWN)
1729                                break;
1730
1731                        if (sk->sk_state == TCP_CLOSE) {
1732                                if (!sock_flag(sk, SOCK_DONE)) {
1733                                        /* This occurs when the user tries to
1734                                         * read from a never-connected socket.
1735                                         */
1736                                        copied = -ENOTCONN;
1737                                        break;
1738                                }
1739                                break;
1740                        }
1741
1742                        if (!timeo) {
1743                                copied = -EAGAIN;
1744                                break;
1745                        }
1746
1747                        if (signal_pending(current)) {
1748                                copied = sock_intr_errno(timeo);
1749                                break;
1750                        }
1751                }
1752
1753                tcp_cleanup_rbuf(sk, copied);
1754
1755                if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1756                        /* Install new reader */
1757                        if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1758                                user_recv = current;
1759                                tp->ucopy.task = user_recv;
1760                                tp->ucopy.msg = msg;
1761                        }
1762
1763                        tp->ucopy.len = len;
1764
1765                        WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1766                                !(flags & (MSG_PEEK | MSG_TRUNC)));
1767
1768                        /* Ugly... If the prequeue is not empty, we have to
1769                         * process it before releasing the socket, otherwise
1770                         * the order will be broken at the second iteration.
1771                         * A more elegant solution is required!!!
1772                         *
1773                         * Look: we have the following (pseudo)queues:
1774                         *
1775                         * 1. packets in flight
1776                         * 2. backlog
1777                         * 3. prequeue
1778                         * 4. receive_queue
1779                         *
1780                         * Each queue can be processed only if the next ones
1781                         * are empty. At this point the receive_queue is empty.
1782                         * But the prequeue _can_ be non-empty after the 2nd iteration,
1783                         * when we jumped to the start of the loop because backlog
1784                         * processing added something to the receive_queue.
1785                         * We cannot release_sock(), because the backlog contains
1786                         * packets that arrived _after_ the prequeued ones.
1787                         *
1788                         * In short, the algorithm is clear --- process all
1789                         * the queues in order. We could do it more directly,
1790                         * requeueing packets from the backlog to the prequeue if
1791                         * it is not empty. That is more elegant, but eats cycles,
1792                         * unfortunately.
1793                         */
1794                        if (!skb_queue_empty(&tp->ucopy.prequeue))
1795                                goto do_prequeue;
1796
1797                        /* __ Set realtime policy in scheduler __ */
1798                }
1799
1800                if (copied >= target) {
1801                        /* Do not sleep, just process backlog. */
1802                        release_sock(sk);
1803                        lock_sock(sk);
1804                } else {
1805                        sk_wait_data(sk, &timeo, last);
1806                }
1807
1808                if (user_recv) {
1809                        int chunk;
1810
1811                        /* __ Restore normal policy in scheduler __ */
1812
1813                        chunk = len - tp->ucopy.len;
1814                        if (chunk != 0) {
1815                                NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1816                                len -= chunk;
1817                                copied += chunk;
1818                        }
1819
1820                        if (tp->rcv_nxt == tp->copied_seq &&
1821                            !skb_queue_empty(&tp->ucopy.prequeue)) {
1822do_prequeue:
1823                                tcp_prequeue_process(sk);
1824
1825                                chunk = len - tp->ucopy.len;
1826                                if (chunk != 0) {
1827                                        NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1828                                        len -= chunk;
1829                                        copied += chunk;
1830                                }
1831                        }
1832                }
1833                if ((flags & MSG_PEEK) &&
1834                    (peek_seq - copied - urg_hole != tp->copied_seq)) {
1835                        net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1836                                            current->comm,
1837                                            task_pid_nr(current));
1838                        peek_seq = tp->copied_seq;
1839                }
1840                continue;
1841
1842        found_ok_skb:
1843                /* Ok so how much can we use? */
1844                used = skb->len - offset;
1845                if (len < used)
1846                        used = len;
1847
1848                /* Do we have urgent data here? */
1849                if (tp->urg_data) {
1850                        u32 urg_offset = tp->urg_seq - *seq;
1851                        if (urg_offset < used) {
1852                                if (!urg_offset) {
1853                                        if (!sock_flag(sk, SOCK_URGINLINE)) {
1854                                                ++*seq;
1855                                                urg_hole++;
1856                                                offset++;
1857                                                used--;
1858                                                if (!used)
1859                                                        goto skip_copy;
1860                                        }
1861                                } else
1862                                        used = urg_offset;
1863                        }
1864                }
1865
1866                if (!(flags & MSG_TRUNC)) {
1867                        err = skb_copy_datagram_msg(skb, offset, msg, used);
1868                        if (err) {
1869                                /* Exception. Bailout! */
1870                                if (!copied)
1871                                        copied = -EFAULT;
1872                                break;
1873                        }
1874                }
1875
1876                *seq += used;
1877                copied += used;
1878                len -= used;
1879
1880                tcp_rcv_space_adjust(sk);
1881
1882skip_copy:
1883                if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1884                        tp->urg_data = 0;
1885                        tcp_fast_path_check(sk);
1886                }
1887                if (used + offset < skb->len)
1888                        continue;
1889
1890                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1891                        goto found_fin_ok;
1892                if (!(flags & MSG_PEEK))
1893                        sk_eat_skb(sk, skb);
1894                continue;
1895
1896        found_fin_ok:
1897                /* Process the FIN. */
1898                ++*seq;
1899                if (!(flags & MSG_PEEK))
1900                        sk_eat_skb(sk, skb);
1901                break;
1902        } while (len > 0);
1903
1904        if (user_recv) {
1905                if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1906                        int chunk;
1907
1908                        tp->ucopy.len = copied > 0 ? len : 0;
1909
1910                        tcp_prequeue_process(sk);
1911
1912                        if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1913                                NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1914                                len -= chunk;
1915                                copied += chunk;
1916                        }
1917                }
1918
1919                tp->ucopy.task = NULL;
1920                tp->ucopy.len = 0;
1921        }
1922
1923        /* According to UNIX98, msg_name/msg_namelen are ignored
1924         * on a connected socket. I was just happy when I found this 8) --ANK
1925         */
1926
1927        /* Clean up data we have read: This will do ACK frames. */
1928        tcp_cleanup_rbuf(sk, copied);
1929
1930        release_sock(sk);
1931        return copied;
1932
1933out:
1934        release_sock(sk);
1935        return err;
1936
1937recv_urg:
1938        err = tcp_recv_urg(sk, msg, len, flags);
1939        goto out;
1940
1941recv_sndq:
1942        err = tcp_peek_sndq(sk, msg, len);
1943        goto out;
1944}
1945EXPORT_SYMBOL(tcp_recvmsg);
1946
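/* Move the socket to a new state.  This keeps the CurrEstab/EstabResets
 * MIB counters in sync, unhashes the socket and releases its bound port
 * (unless the port is locked) on the way to TCP_CLOSE, and publishes the
 * new state with sk_state_store() only after that, so lockless readers
 * using sk_state_load() never see a closed socket still sitting in the
 * hash tables.
 */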
1947void tcp_set_state(struct sock *sk, int state)
1948{
1949        int oldstate = sk->sk_state;
1950
1951        switch (state) {
1952        case TCP_ESTABLISHED:
1953                if (oldstate != TCP_ESTABLISHED)
1954                        TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1955                break;
1956
1957        case TCP_CLOSE:
1958                if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1959                        TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1960
1961                sk->sk_prot->unhash(sk);
1962                if (inet_csk(sk)->icsk_bind_hash &&
1963                    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1964                        inet_put_port(sk);
1965                /* fall through */
1966        default:
1967                if (oldstate == TCP_ESTABLISHED)
1968                        TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1969        }
1970
1971        /* Change state AFTER socket is unhashed to avoid closed
1972         * socket sitting in hash tables.
1973         */
1974        sk_state_store(sk, state);
1975
1976#ifdef STATE_TRACE
1977        SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1978#endif
1979}
1980EXPORT_SYMBOL_GPL(tcp_set_state);
1981
1982/*
1983 *      State processing on a close. This implements the state shift for
1984 *      sending our FIN frame. Note that we only send a FIN for some
1985 *      states. A shutdown() may have already sent the FIN, or we may be
1986 *      closed.
1987 */
1988
1989static const unsigned char new_state[16] = {
1990  /* current state:        new state:      action:      */
1991  [0 /* (Invalid) */]   = TCP_CLOSE,
1992  [TCP_ESTABLISHED]     = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1993  [TCP_SYN_SENT]        = TCP_CLOSE,
1994  [TCP_SYN_RECV]        = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1995  [TCP_FIN_WAIT1]       = TCP_FIN_WAIT1,
1996  [TCP_FIN_WAIT2]       = TCP_FIN_WAIT2,
1997  [TCP_TIME_WAIT]       = TCP_CLOSE,
1998  [TCP_CLOSE]           = TCP_CLOSE,
1999  [TCP_CLOSE_WAIT]      = TCP_LAST_ACK  | TCP_ACTION_FIN,
2000  [TCP_LAST_ACK]        = TCP_LAST_ACK,
2001  [TCP_LISTEN]          = TCP_CLOSE,
2002  [TCP_CLOSING]         = TCP_CLOSING,
2003  [TCP_NEW_SYN_RECV]    = TCP_CLOSE,    /* should not happen ! */
2004};
2005
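/* Each new_state[] entry packs the next state in its low bits
 * (TCP_STATE_MASK) and ORs in TCP_ACTION_FIN for transitions that must
 * emit a FIN.  tcp_close_state() applies the state change and returns
 * the action bit, which is why its callers do
 * "if (tcp_close_state(sk)) tcp_send_fin(sk);".
 */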
2006static int tcp_close_state(struct sock *sk)
2007{
2008        int next = (int)new_state[sk->sk_state];
2009        int ns = next & TCP_STATE_MASK;
2010
2011        tcp_set_state(sk, ns);
2012
2013        return next & TCP_ACTION_FIN;
2014}
2015
2016/*
2017 *      Shutdown the sending side of a connection. Much like close except
2018 *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
2019 */
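/*
 *	Reached from userspace via shutdown(2) through inet_shutdown().
 *	Only the send side is acted upon here (see the SEND_SHUTDOWN check
 *	below); a read-side shutdown is recorded by the caller and is a
 *	no-op in this function.
 */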
2020
2021void tcp_shutdown(struct sock *sk, int how)
2022{
2023        /*      We need to grab some memory, and put together a FIN,
2024         *      and then put it into the queue to be sent.
2025         *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2026         */
2027        if (!(how & SEND_SHUTDOWN))
2028                return;
2029
2030        /* If we've already sent a FIN, or it's a closed state, skip this. */
2031        if ((1 << sk->sk_state) &
2032            (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2033             TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2034                /* Clear out any half completed packets.  FIN if needed. */
2035                if (tcp_close_state(sk))
2036                        tcp_send_fin(sk);
2037        }
2038}
2039EXPORT_SYMBOL(tcp_shutdown);
2040
2041bool tcp_check_oom(struct sock *sk, int shift)
2042{
2043        bool too_many_orphans, out_of_socket_memory;
2044
2045        too_many_orphans = tcp_too_many_orphans(sk, shift);
2046        out_of_socket_memory = tcp_out_of_memory(sk);
2047
2048        if (too_many_orphans)
2049                net_info_ratelimited("too many orphaned sockets\n");
2050        if (out_of_socket_memory)
2051                net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2052        return too_many_orphans || out_of_socket_memory;
2053}
2054
2055void tcp_close(struct sock *sk, long timeout)
2056{
2057        struct sk_buff *skb;
2058        int data_was_unread = 0;
2059        int state;
2060
2061        lock_sock(sk);
2062        sk->sk_shutdown = SHUTDOWN_MASK;
2063
2064        if (sk->sk_state == TCP_LISTEN) {
2065                tcp_set_state(sk, TCP_CLOSE);
2066
2067                /* Special case. */
2068                inet_csk_listen_stop(sk);
2069
2070                goto adjudge_to_death;
2071        }
2072
2073        /*  We need to flush the recv. buffs.  We do this only on the
2074         *  descriptor close, not protocol-sourced closes, because the
2075         *  reader process may not have drained the data yet!
2076         */
2077        while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2078                u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2079
2080                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2081                        len--;
2082                data_was_unread += len;
2083                __kfree_skb(skb);
2084        }
2085
2086        sk_mem_reclaim(sk);
2087
2088        /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
2089        if (sk->sk_state == TCP_CLOSE)
2090                goto adjudge_to_death;
2091
2092        /* As outlined in RFC 2525, section 2.17, we send a RST here because
2093         * data was lost. To witness the awful effects of the old behavior of
2094         * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
2095         * GET in an FTP client, suspend the process, wait for the client to
2096         * advertise a zero window, then kill -9 the FTP client, wheee...
2097         * Note: timeout is always zero in such a case.
2098         */
2099        if (unlikely(tcp_sk(sk)->repair)) {
2100                sk->sk_prot->disconnect(sk, 0);
2101        } else if (data_was_unread) {
2102                /* Unread data was tossed, zap the connection. */
2103                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2104                tcp_set_state(sk, TCP_CLOSE);
2105                tcp_send_active_reset(sk, sk->sk_allocation);
2106        } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2107                /* Check zero linger _after_ checking for unread data. */
2108                sk->sk_prot->disconnect(sk, 0);
2109                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2110        } else if (tcp_close_state(sk)) {
2111                /* We FIN if the application ate all the data before
2112                 * zapping the connection.
2113                 */
2114
2115                /* RED-PEN. Formally speaking, we have broken TCP state
2116                 * machine. State transitions:
2117                 *
2118                 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2119                 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
2120                 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2121                 *
2122                 * are legal only when FIN has been sent (i.e. in window),
2123                 * rather than queued out of window. Purists blame.
2124                 *
2125                 * E.g. the "RFC state" is ESTABLISHED
2126                 * if the Linux state is FIN-WAIT-1 but the FIN has not been sent yet.
2127                 *
2128                 * The visible deviations are that sometimes we enter the
2129                 * time-wait state when it is not really required (harmless),
2130                 * and do not send active resets when they are required by the
2131                 * specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when they look
2132                 * like CLOSING or LAST_ACK to Linux).
2133                 * Probably, I missed some more holelets.
2134                 *                                              --ANK
2135                 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2136                 * in a single packet! (May consider it later but will
2137                 * probably need API support or TCP_CORK SYN-ACK until
2138                 * data is written and socket is closed.)
2139                 */
2140                tcp_send_fin(sk);
2141        }
2142
2143        sk_stream_wait_close(sk, timeout);
2144
2145adjudge_to_death:
2146        state = sk->sk_state;
2147        sock_hold(sk);
2148        sock_orphan(sk);
2149
2150        /* It is the last release_sock in its life. It will remove backlog. */
2151        release_sock(sk);
2152
2153
2154        /* Now socket is owned by kernel and we acquire BH lock
2155         * to finish close. No need to check for user refs.
2156         */
2157        local_bh_disable();
2158        bh_lock_sock(sk);
2159        WARN_ON(sock_owned_by_user(sk));
2160
2161        percpu_counter_inc(sk->sk_prot->orphan_count);
2162
2163        /* Have we already been destroyed by a softirq or backlog? */
2164        if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2165                goto out;
2166
2167        /*      This is a (useful) BSD violation of the RFC. There is a
2168         *      problem with TCP as specified, in that the other end could
2169         *      keep a socket open forever with no application left at this end.
2170         *      We use a 1 minute timeout (about the same as BSD) and then kill
2171         *      our end. If they send after that then tough - BUT: long enough
2172         *      that we won't make the old "4*rto = almost no time - whoops,
2173         *      reset" mistake.
2174         *
2175         *      Nope, it was not a mistake. It is really the desired behaviour,
2176         *      e.g. on HTTP servers, where such sockets are useless but
2177         *      consume significant resources. Let's do it with the special
2178         *      linger2 option.                                 --ANK
2179         */
2180
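        /* tp->linger2 is controlled by the TCP_LINGER2 socket option (see
         * do_tcp_setsockopt() below) and defaults to the tcp_fin_timeout
         * sysctl: a negative value means reset instead of waiting, otherwise
         * it bounds how long we are willing to linger in FIN-WAIT-2.
         */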
2181        if (sk->sk_state == TCP_FIN_WAIT2) {
2182                struct tcp_sock *tp = tcp_sk(sk);
2183                if (tp->linger2 < 0) {
2184                        tcp_set_state(sk, TCP_CLOSE);
2185                        tcp_send_active_reset(sk, GFP_ATOMIC);
2186                        __NET_INC_STATS(sock_net(sk),
2187                                        LINUX_MIB_TCPABORTONLINGER);
2188                } else {
2189                        const int tmo = tcp_fin_time(sk);
2190
2191                        if (tmo > TCP_TIMEWAIT_LEN) {
2192                                inet_csk_reset_keepalive_timer(sk,
2193                                                tmo - TCP_TIMEWAIT_LEN);
2194                        } else {
2195                                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2196                                goto out;
2197                        }
2198                }
2199        }
2200        if (sk->sk_state != TCP_CLOSE) {
2201                sk_mem_reclaim(sk);
2202                if (tcp_check_oom(sk, 0)) {
2203                        tcp_set_state(sk, TCP_CLOSE);
2204                        tcp_send_active_reset(sk, GFP_ATOMIC);
2205                        __NET_INC_STATS(sock_net(sk),
2206                                        LINUX_MIB_TCPABORTONMEMORY);
2207                }
2208        }
2209
2210        if (sk->sk_state == TCP_CLOSE) {
2211                struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2212                /* We could get here with a non-NULL req if the socket is
2213                 * aborted (e.g., closed with unread data) before 3WHS
2214                 * finishes.
2215                 */
2216                if (req)
2217                        reqsk_fastopen_remove(sk, req, false);
2218                inet_csk_destroy_sock(sk);
2219        }
2220        /* Otherwise, socket is reprieved until protocol close. */
2221
2222out:
2223        bh_unlock_sock(sk);
2224        local_bh_enable();
2225        sock_put(sk);
2226}
2227EXPORT_SYMBOL(tcp_close);
2228
2229/* These states need RST on ABORT according to RFC793 */
2230
2231static inline bool tcp_need_reset(int state)
2232{
2233        return (1 << state) &
2234               (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2235                TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2236}
2237
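/* Reset the connection and return the socket to a clean, unconnected
 * state.  Userspace reaches this by calling connect() with an address
 * family of AF_UNSPEC; it is also used internally, e.g. by tcp_close()
 * above for zero-linger and repair-mode sockets.
 */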
2238int tcp_disconnect(struct sock *sk, int flags)
2239{
2240        struct inet_sock *inet = inet_sk(sk);
2241        struct inet_connection_sock *icsk = inet_csk(sk);
2242        struct tcp_sock *tp = tcp_sk(sk);
2243        int err = 0;
2244        int old_state = sk->sk_state;
2245
2246        if (old_state != TCP_CLOSE)
2247                tcp_set_state(sk, TCP_CLOSE);
2248
2249        /* ABORT function of RFC793 */
2250        if (old_state == TCP_LISTEN) {
2251                inet_csk_listen_stop(sk);
2252        } else if (unlikely(tp->repair)) {
2253                sk->sk_err = ECONNABORTED;
2254        } else if (tcp_need_reset(old_state) ||
2255                   (tp->snd_nxt != tp->write_seq &&
2256                    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2257                /* The last check adjusts for the discrepancy between Linux
2258                 * and the RFC states.
2259                 */
2260                tcp_send_active_reset(sk, gfp_any());
2261                sk->sk_err = ECONNRESET;
2262        } else if (old_state == TCP_SYN_SENT)
2263                sk->sk_err = ECONNRESET;
2264
2265        tcp_clear_xmit_timers(sk);
2266        __skb_queue_purge(&sk->sk_receive_queue);
2267        tcp_write_queue_purge(sk);
2268        skb_rbtree_purge(&tp->out_of_order_queue);
2269
2270        inet->inet_dport = 0;
2271
2272        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2273                inet_reset_saddr(sk);
2274
2275        sk->sk_shutdown = 0;
2276        sock_reset_flag(sk, SOCK_DONE);
2277        tp->srtt_us = 0;
2278        tp->write_seq += tp->max_window + 2;
2279        if (tp->write_seq == 0)
2280                tp->write_seq = 1;
2281        icsk->icsk_backoff = 0;
2282        tp->snd_cwnd = 2;
2283        icsk->icsk_probes_out = 0;
2284        tp->packets_out = 0;
2285        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2286        tp->snd_cwnd_cnt = 0;
2287        tp->window_clamp = 0;
2288        tcp_set_ca_state(sk, TCP_CA_Open);
2289        tcp_clear_retrans(tp);
2290        inet_csk_delack_init(sk);
2291        tcp_init_send_head(sk);
2292        memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2293        __sk_dst_reset(sk);
2294
2295        WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2296
2297        sk->sk_error_report(sk);
2298        return err;
2299}
2300EXPORT_SYMBOL(tcp_disconnect);
2301
2302static inline bool tcp_can_repair_sock(const struct sock *sk)
2303{
2304        return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2305                ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2306}
2307
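/* Handler for the TCP_REPAIR_WINDOW socket option.  A checkpoint/restore
 * tool (e.g. CRIU) that has already switched the socket into repair mode
 * would use it roughly like this (illustrative userspace code, values
 * elided):
 *
 *	struct tcp_repair_window opt = {
 *		.snd_wl1    = ...,
 *		.snd_wnd    = ...,
 *		.max_window = ...,
 *		.rcv_wnd    = ...,
 *		.rcv_wup    = ...,
 *	};
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_WINDOW, &opt, sizeof(opt));
 */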
2308static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
2309{
2310        struct tcp_repair_window opt;
2311
2312        if (!tp->repair)
2313                return -EPERM;
2314
2315        if (len != sizeof(opt))
2316                return -EINVAL;
2317
2318        if (copy_from_user(&opt, optbuf, sizeof(opt)))
2319                return -EFAULT;
2320
2321        if (opt.max_window < opt.snd_wnd)
2322                return -EINVAL;
2323
2324        if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
2325                return -EINVAL;
2326
2327        if (after(opt.rcv_wup, tp->rcv_nxt))
2328                return -EINVAL;
2329
2330        tp->snd_wl1     = opt.snd_wl1;
2331        tp->snd_wnd     = opt.snd_wnd;
2332        tp->max_window  = opt.max_window;
2333
2334        tp->rcv_wnd     = opt.rcv_wnd;
2335        tp->rcv_wup     = opt.rcv_wup;
2336
2337        return 0;
2338}
2339
2340static int tcp_repair_options_est(struct tcp_sock *tp,
2341                struct tcp_repair_opt __user *optbuf, unsigned int len)
2342{
2343        struct tcp_repair_opt opt;
2344
2345        while (len >= sizeof(opt)) {
2346                if (copy_from_user(&opt, optbuf, sizeof(opt)))
2347                        return -EFAULT;
2348
2349                optbuf++;
2350                len -= sizeof(opt);
2351
2352                switch (opt.opt_code) {
2353                case TCPOPT_MSS:
2354                        tp->rx_opt.mss_clamp = opt.opt_val;
2355                        break;
2356                case TCPOPT_WINDOW:
2357                        {
2358                                u16 snd_wscale = opt.opt_val & 0xFFFF;
2359                                u16 rcv_wscale = opt.opt_val >> 16;
2360
2361                                if (snd_wscale > 14 || rcv_wscale > 14)
2362                                        return -EFBIG;
2363
2364                                tp->rx_opt.snd_wscale = snd_wscale;
2365                                tp->rx_opt.rcv_wscale = rcv_wscale;
2366                                tp->rx_opt.wscale_ok = 1;
2367                        }
2368                        break;
2369                case TCPOPT_SACK_PERM:
2370                        if (opt.opt_val != 0)
2371                                return -EINVAL;
2372
2373                        tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2374                        if (sysctl_tcp_fack)
2375                                tcp_enable_fack(tp);
2376                        break;
2377                case TCPOPT_TIMESTAMP:
2378                        if (opt.opt_val != 0)
2379                                return -EINVAL;
2380
2381                        tp->rx_opt.tstamp_ok = 1;
2382                        break;
2383                }
2384        }
2385
2386        return 0;
2387}
2388
2389/*
2390 *      Socket option code for TCP.
2391 */
2392static int do_tcp_setsockopt(struct sock *sk, int level,
2393                int optname, char __user *optval, unsigned int optlen)
2394{
2395        struct tcp_sock *tp = tcp_sk(sk);
2396        struct inet_connection_sock *icsk = inet_csk(sk);
2397        struct net *net = sock_net(sk);
2398        int val;
2399        int err = 0;
2400
2401        /* These are data/string values, all the others are ints */
2402        switch (optname) {
2403        case TCP_CONGESTION: {
2404                char name[TCP_CA_NAME_MAX];
2405
2406                if (optlen < 1)
2407                        return -EINVAL;
2408
2409                val = strncpy_from_user(name, optval,
2410                                        min_t(long, TCP_CA_NAME_MAX-1, optlen));
2411                if (val < 0)
2412                        return -EFAULT;
2413                name[val] = 0;
2414
2415                lock_sock(sk);
2416                err = tcp_set_congestion_control(sk, name);
2417                release_sock(sk);
2418                return err;
2419        }
2420        default:
2421                /* fall through to the integer-valued options below */
2422                break;
2423        }
2424
2425        if (optlen < sizeof(int))
2426                return -EINVAL;
2427
2428        if (get_user(val, (int __user *)optval))
2429                return -EFAULT;
2430
2431        lock_sock(sk);
2432
2433        switch (optname) {
2434        case TCP_MAXSEG:
2435                /* Values greater than the interface MTU won't take effect. However,
2436                 * at the point when this call is done we typically don't yet
2437                 * know which interface is going to be used. */
2438                if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2439                        err = -EINVAL;
2440                        break;
2441                }
2442                tp->rx_opt.user_mss = val;
2443                break;
2444
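        /* Typical userspace use for latency-sensitive writers
         * (illustrative):
         *
         *	int one = 1;
         *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
         */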
2445        case TCP_NODELAY:
2446                if (val) {
2447                        /* TCP_NODELAY is weaker than TCP_CORK, so that
2448                         * this option on corked socket is remembered, but
2449                         * it is not activated until cork is cleared.
2450                         *
2451                         * However, when TCP_NODELAY is set we make
2452                         * an explicit push, which overrides even TCP_CORK
2453                         * for currently queued segments.
2454                         */
2455                        tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2456                        tcp_push_pending_frames(sk);
2457                } else {
2458                        tp->nonagle &= ~TCP_NAGLE_OFF;
2459                }
2460                break;
2461
2462        case TCP_THIN_LINEAR_TIMEOUTS:
2463                if (val < 0 || val > 1)
2464                        err = -EINVAL;
2465                else
2466                        tp->thin_lto = val;
2467                break;
2468
2469        case TCP_THIN_DUPACK:
2470                if (val < 0 || val > 1)
2471                        err = -EINVAL;
2472                else {
2473                        tp->thin_dupack = val;
2474                        if (tp->thin_dupack)
2475                                tcp_disable_early_retrans(tp);
2476                }
2477                break;
2478
2479        case TCP_REPAIR:
2480                if (!tcp_can_repair_sock(sk))
2481                        err = -EPERM;
2482                else if (val == 1) {
2483                        tp->repair = 1;
2484                        sk->sk_reuse = SK_FORCE_REUSE;
2485                        tp->repair_queue = TCP_NO_QUEUE;
2486                } else if (val == 0) {
2487                        tp->repair = 0;
2488                        sk->sk_reuse = SK_NO_REUSE;
2489                        tcp_send_window_probe(sk);
2490                } else
2491                        err = -EINVAL;
2492
2493                break;
2494
2495        case TCP_REPAIR_QUEUE:
2496                if (!tp->repair)
2497                        err = -EPERM;
2498                else if (val < TCP_QUEUES_NR)
2499                        tp->repair_queue = val;
2500                else
2501                        err = -EINVAL;
2502                break;
2503
2504        case TCP_QUEUE_SEQ:
2505                if (sk->sk_state != TCP_CLOSE)
2506                        err = -EPERM;
2507                else if (tp->repair_queue == TCP_SEND_QUEUE)
2508                        tp->write_seq = val;
2509                else if (tp->repair_queue == TCP_RECV_QUEUE)
2510                        tp->rcv_nxt = val;
2511                else
2512                        err = -EINVAL;
2513                break;
2514
2515        case TCP_REPAIR_OPTIONS:
2516                if (!tp->repair)
2517                        err = -EINVAL;
2518                else if (sk->sk_state == TCP_ESTABLISHED)
2519                        err = tcp_repair_options_est(tp,
2520                                        (struct tcp_repair_opt __user *)optval,
2521                                        optlen);
2522                else
2523                        err = -EPERM;
2524                break;
2525
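        /* The headers-then-sendfile() pattern described in the comment
         * below looks roughly like this from userspace (illustrative):
         *
         *	int on = 1, off = 0;
         *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
         *	write(fd, hdr, hdr_len);
         *	sendfile(fd, file_fd, NULL, file_len);
         *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
         */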
2526        case TCP_CORK:
2527                /* When set, non-full frames are always queued.
2528                 * Later the user clears this option and we transmit
2529                 * any pending partial frames in the queue.  This is
2530                 * meant to be used alongside sendfile() to get properly
2531                 * filled frames when the user (for example) must write
2532                 * out headers with a write() call first and then use
2533                 * sendfile to send out the data parts.
2534                 *
2535                 * TCP_CORK can be set together with TCP_NODELAY and it is
2536                 * stronger than TCP_NODELAY.
2537                 */
2538                if (val) {
2539                        tp->nonagle |= TCP_NAGLE_CORK;
2540                } else {
2541                        tp->nonagle &= ~TCP_NAGLE_CORK;
2542                        if (tp->nonagle&TCP_NAGLE_OFF)
2543                                tp->nonagle |= TCP_NAGLE_PUSH;
2544                        tcp_push_pending_frames(sk);
2545                }
2546                break;
2547
2548        case TCP_KEEPIDLE:
2549                if (val < 1 || val > MAX_TCP_KEEPIDLE)
2550                        err = -EINVAL;
2551                else {
2552                        tp->keepalive_time = val * HZ;
2553                        if (sock_flag(sk, SOCK_KEEPOPEN) &&
2554                            !((1 << sk->sk_state) &
2555                              (TCPF_CLOSE | TCPF_LISTEN))) {
2556                                u32 elapsed = keepalive_time_elapsed(tp);
2557                                if (tp->keepalive_time > elapsed)
2558                                        elapsed = tp->keepalive_time - elapsed;
2559                                else
2560                                        elapsed = 0;
2561                                inet_csk_reset_keepalive_timer(sk, elapsed);
2562                        }
2563                }
2564                break;
2565        case TCP_KEEPINTVL:
2566                if (val < 1 || val > MAX_TCP_KEEPINTVL)
2567                        err = -EINVAL;
2568                else
2569                        tp->keepalive_intvl = val * HZ;
2570                break;
2571        case TCP_KEEPCNT:
2572                if (val < 1 || val > MAX_TCP_KEEPCNT)
2573                        err = -EINVAL;
2574                else
2575                        tp->keepalive_probes = val;
2576                break;
2577        case TCP_SYNCNT:
2578                if (val < 1 || val > MAX_TCP_SYNCNT)
2579                        err = -EINVAL;
2580                else
2581                        icsk->icsk_syn_retries = val;
2582                break;
2583
2584        case TCP_SAVE_SYN:
2585                if (val < 0 || val > 1)
2586                        err = -EINVAL;
2587                else
2588                        tp->save_syn = val;
2589                break;
2590
2591        case TCP_LINGER2:
2592                if (val < 0)
2593                        tp->linger2 = -1;
2594                else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
2595                        tp->linger2 = 0;
2596                else
2597                        tp->linger2 = val * HZ;
2598                break;
2599
2600        case TCP_DEFER_ACCEPT:
2601                /* Translate value in seconds to number of retransmits */
2602                icsk->icsk_accept_queue.rskq_defer_accept =
2603                        secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2604                                        TCP_RTO_MAX / HZ);
2605                break;
2606
2607        case TCP_WINDOW_CLAMP:
2608                if (!val) {
2609                        if (sk->sk_state != TCP_CLOSE) {
2610                                err = -EINVAL;
2611                                break;
2612                        }
2613                        tp->window_clamp = 0;
2614                } else
2615                        tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2616                                                SOCK_MIN_RCVBUF / 2 : val;
2617                break;
2618
2619        case TCP_QUICKACK:
2620                if (!val) {
2621                        icsk->icsk_ack.pingpong = 1;
2622                } else {
2623                        icsk->icsk_ack.pingpong = 0;
2624                        if ((1 << sk->sk_state) &
2625                            (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2626                            inet_csk_ack_scheduled(sk)) {
2627                                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2628                                tcp_cleanup_rbuf(sk, 1);
2629                                if (!(val & 1))
2630                                        icsk->icsk_ack.pingpong = 1;
2631                        }
2632                }
2633                break;
2634
2635#ifdef CONFIG_TCP_MD5SIG
2636        case TCP_MD5SIG:
2637                /* Read the IP->Key mappings from userspace */
2638                err = tp->af_specific->md5_parse(sk, optval, optlen);
2639                break;
2640#endif
2641        case TCP_USER_TIMEOUT:
2642                /* Cap the max time in ms TCP will retry or probe the window
2643                 * before giving up and aborting (ETIMEDOUT) a connection.
2644                 */
2645                if (val < 0)
2646                        err = -EINVAL;
2647                else
2648                        icsk->icsk_user_timeout = msecs_to_jiffies(val);
2649                break;
2650
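        /* On the passive side this sizes the Fast Open request queue via
         * fastopen_queue_tune(); a server typically enables it with
         * something like (illustrative):
         *
         *	int qlen = 16;
         *	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
         */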
2651        case TCP_FASTOPEN:
2652                if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2653                    TCPF_LISTEN))) {
2654                        tcp_fastopen_init_key_once(true);
2655
2656                        fastopen_queue_tune(sk, val);
2657                } else {
2658                        err = -EINVAL;
2659                }
2660                break;
2661        case TCP_TIMESTAMP:
2662                if (!tp->repair)
2663                        err = -EPERM;
2664                else
2665                        tp->tsoffset = val - tcp_time_stamp;
2666                break;
2667        case TCP_REPAIR_WINDOW:
2668                err = tcp_repair_set_window(tp, optval, optlen);
2669                break;
2670        case TCP_NOTSENT_LOWAT:
2671                tp->notsent_lowat = val;
2672                sk->sk_write_space(sk);
2673                break;
2674        default:
2675                err = -ENOPROTOOPT;
2676                break;
2677        }
2678
2679        release_sock(sk);
2680        return err;
2681}
2682
2683int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2684                   unsigned int optlen)
2685{
2686        const struct inet_connection_sock *icsk = inet_csk(sk);
2687
2688        if (level != SOL_TCP)
2689                return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2690                                                     optval, optlen);
2691        return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2692}
2693EXPORT_SYMBOL(tcp_setsockopt);
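/* Illustrative userspace sketch: any option set at the SOL_TCP/IPPROTO_TCP
 * level on a TCP socket reaches do_tcp_setsockopt() through this wrapper,
 * e.g. disabling Nagle (assuming "fd" is a connected TCP socket):
 *
 *      #include <netinet/in.h>
 *      #include <netinet/tcp.h>
 *      #include <sys/socket.h>
 *
 *      int one = 1;
 *
 *      setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 */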
2694
2695#ifdef CONFIG_COMPAT
2696int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2697                          char __user *optval, unsigned int optlen)
2698{
2699        if (level != SOL_TCP)
2700                return inet_csk_compat_setsockopt(sk, level, optname,
2701                                                  optval, optlen);
2702        return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2703}
2704EXPORT_SYMBOL(compat_tcp_setsockopt);
2705#endif
2706
2707/* Return information about the state of a TCP endpoint in API format. */
2708void tcp_get_info(struct sock *sk, struct tcp_info *info)
2709{
2710        const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
2711        const struct inet_connection_sock *icsk = inet_csk(sk);
2712        u32 now = tcp_time_stamp, intv;
2713        unsigned int start;
2714        int notsent_bytes;
2715        u64 rate64;
2716        u32 rate;
2717
2718        memset(info, 0, sizeof(*info));
2719        if (sk->sk_type != SOCK_STREAM)
2720                return;
2721
2722        info->tcpi_state = sk_state_load(sk);
2723
2724        info->tcpi_ca_state = icsk->icsk_ca_state;
2725        info->tcpi_retransmits = icsk->icsk_retransmits;
2726        info->tcpi_probes = icsk->icsk_probes_out;
2727        info->tcpi_backoff = icsk->icsk_backoff;
2728
2729        if (tp->rx_opt.tstamp_ok)
2730                info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2731        if (tcp_is_sack(tp))
2732                info->tcpi_options |= TCPI_OPT_SACK;
2733        if (tp->rx_opt.wscale_ok) {
2734                info->tcpi_options |= TCPI_OPT_WSCALE;
2735                info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2736                info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2737        }
2738
2739        if (tp->ecn_flags & TCP_ECN_OK)
2740                info->tcpi_options |= TCPI_OPT_ECN;
2741        if (tp->ecn_flags & TCP_ECN_SEEN)
2742                info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2743        if (tp->syn_data_acked)
2744                info->tcpi_options |= TCPI_OPT_SYN_DATA;
2745
2746        info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2747        info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2748        info->tcpi_snd_mss = tp->mss_cache;
2749        info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2750
2751        if (info->tcpi_state == TCP_LISTEN) {
2752                info->tcpi_unacked = sk->sk_ack_backlog;
2753                info->tcpi_sacked = sk->sk_max_ack_backlog;
2754        } else {
2755                info->tcpi_unacked = tp->packets_out;
2756                info->tcpi_sacked = tp->sacked_out;
2757        }
2758        info->tcpi_lost = tp->lost_out;
2759        info->tcpi_retrans = tp->retrans_out;
2760        info->tcpi_fackets = tp->fackets_out;
2761
2762        info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2763        info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2764        info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2765
2766        info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2767        info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2768        info->tcpi_rtt = tp->srtt_us >> 3;
2769        info->tcpi_rttvar = tp->mdev_us >> 2;
2770        info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2771        info->tcpi_snd_cwnd = tp->snd_cwnd;
2772        info->tcpi_advmss = tp->advmss;
2773        info->tcpi_reordering = tp->reordering;
2774
2775        info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2776        info->tcpi_rcv_space = tp->rcvq_space.space;
2777
2778        info->tcpi_total_retrans = tp->total_retrans;
2779
2780        rate = READ_ONCE(sk->sk_pacing_rate);
2781        rate64 = rate != ~0U ? rate : ~0ULL;
2782        put_unaligned(rate64, &info->tcpi_pacing_rate);
2783
2784        rate = READ_ONCE(sk->sk_max_pacing_rate);
2785        rate64 = rate != ~0U ? rate : ~0ULL;
2786        put_unaligned(rate64, &info->tcpi_max_pacing_rate);
2787
2788        do {
2789                start = u64_stats_fetch_begin_irq(&tp->syncp);
2790                put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
2791                put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
2792        } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
2793        info->tcpi_segs_out = tp->segs_out;
2794        info->tcpi_segs_in = tp->segs_in;
2795
2796        notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
2797        info->tcpi_notsent_bytes = max(0, notsent_bytes);
2798
2799        info->tcpi_min_rtt = tcp_min_rtt(tp);
2800        info->tcpi_data_segs_in = tp->data_segs_in;
2801        info->tcpi_data_segs_out = tp->data_segs_out;
2802
2803        info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
2804        rate = READ_ONCE(tp->rate_delivered);
2805        intv = READ_ONCE(tp->rate_interval_us);
2806        if (rate && intv) {
2807                rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
2808                do_div(rate64, intv);
2809                put_unaligned(rate64, &info->tcpi_delivery_rate);
2810        }
2811}
2812EXPORT_SYMBOL_GPL(tcp_get_info);
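/* Illustrative userspace sketch: tcp_get_info() fills the buffer returned by
 * the TCP_INFO getsockopt (handled in do_tcp_getsockopt() below), so the same
 * counters can be sampled from an application, e.g. smoothed RTT and cwnd:
 *
 *      #include <stdio.h>
 *      #include <netinet/in.h>
 *      #include <netinet/tcp.h>
 *      #include <sys/socket.h>
 *
 *      struct tcp_info ti;
 *      socklen_t len = sizeof(ti);
 *
 *      if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *              printf("rtt=%uus cwnd=%u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */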
2813
2814static int do_tcp_getsockopt(struct sock *sk, int level,
2815                int optname, char __user *optval, int __user *optlen)
2816{
2817        struct inet_connection_sock *icsk = inet_csk(sk);
2818        struct tcp_sock *tp = tcp_sk(sk);
2819        struct net *net = sock_net(sk);
2820        int val, len;
2821
2822        if (get_user(len, optlen))
2823                return -EFAULT;
2824
2825        len = min_t(unsigned int, len, sizeof(int));
2826
2827        if (len < 0)
2828                return -EINVAL;
2829
2830        switch (optname) {
2831        case TCP_MAXSEG:
2832                val = tp->mss_cache;
2833                if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2834                        val = tp->rx_opt.user_mss;
2835                if (tp->repair)
2836                        val = tp->rx_opt.mss_clamp;
2837                break;
2838        case TCP_NODELAY:
2839                val = !!(tp->nonagle&TCP_NAGLE_OFF);
2840                break;
2841        case TCP_CORK:
2842                val = !!(tp->nonagle&TCP_NAGLE_CORK);
2843                break;
2844        case TCP_KEEPIDLE:
2845                val = keepalive_time_when(tp) / HZ;
2846                break;
2847        case TCP_KEEPINTVL:
2848                val = keepalive_intvl_when(tp) / HZ;
2849                break;
2850        case TCP_KEEPCNT:
2851                val = keepalive_probes(tp);
2852                break;
2853        case TCP_SYNCNT:
2854                val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
2855                break;
2856        case TCP_LINGER2:
2857                val = tp->linger2;
2858                if (val >= 0)
2859                        val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
2860                break;
2861        case TCP_DEFER_ACCEPT:
2862                val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
2863                                      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
2864                break;
2865        case TCP_WINDOW_CLAMP:
2866                val = tp->window_clamp;
2867                break;
2868        case TCP_INFO: {
2869                struct tcp_info info;
2870
2871                if (get_user(len, optlen))
2872                        return -EFAULT;
2873
2874                tcp_get_info(sk, &info);
2875
2876                len = min_t(unsigned int, len, sizeof(info));
2877                if (put_user(len, optlen))
2878                        return -EFAULT;
2879                if (copy_to_user(optval, &info, len))
2880                        return -EFAULT;
2881                return 0;
2882        }
2883        case TCP_CC_INFO: {
2884                const struct tcp_congestion_ops *ca_ops;
2885                union tcp_cc_info info;
2886                size_t sz = 0;
2887                int attr;
2888
2889                if (get_user(len, optlen))
2890                        return -EFAULT;
2891
2892                ca_ops = icsk->icsk_ca_ops;
2893                if (ca_ops && ca_ops->get_info)
2894                        sz = ca_ops->get_info(sk, ~0U, &attr, &info);
2895
2896                len = min_t(unsigned int, len, sz);
2897                if (put_user(len, optlen))
2898                        return -EFAULT;
2899                if (copy_to_user(optval, &info, len))
2900                        return -EFAULT;
2901                return 0;
2902        }
2903        case TCP_QUICKACK:
2904                val = !icsk->icsk_ack.pingpong;
2905                break;
2906
2907        case TCP_CONGESTION:
2908                if (get_user(len, optlen))
2909                        return -EFAULT;
2910                len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2911                if (put_user(len, optlen))
2912                        return -EFAULT;
2913                if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2914                        return -EFAULT;
2915                return 0;
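        /* Illustrative userspace sketch mirroring the copy above (the
         * 16-byte buffer matches TCP_CA_NAME_MAX used here):
         *
         *      char ca[16] = "";
         *      socklen_t len = sizeof(ca);
         *
         *      getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, ca, &len);
         */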
2916
2917        case TCP_THIN_LINEAR_TIMEOUTS:
2918                val = tp->thin_lto;
2919                break;
2920        case TCP_THIN_DUPACK:
2921                val = tp->thin_dupack;
2922                break;
2923
2924        case TCP_REPAIR:
2925                val = tp->repair;
2926                break;
2927
2928        case TCP_REPAIR_QUEUE:
2929                if (tp->repair)
2930                        val = tp->repair_queue;
2931                else
2932                        return -EINVAL;
2933                break;
2934
2935        case TCP_REPAIR_WINDOW: {
2936                struct tcp_repair_window opt;
2937
2938                if (get_user(len, optlen))
2939                        return -EFAULT;
2940
2941                if (len != sizeof(opt))
2942                        return -EINVAL;
2943
2944                if (!tp->repair)
2945                        return -EPERM;
2946
2947                opt.snd_wl1     = tp->snd_wl1;
2948                opt.snd_wnd     = tp->snd_wnd;
2949                opt.max_window  = tp->max_window;
2950                opt.rcv_wnd     = tp->rcv_wnd;
2951                opt.rcv_wup     = tp->rcv_wup;
2952
2953                if (copy_to_user(optval, &opt, len))
2954                        return -EFAULT;
2955                return 0;
2956        }
2957        case TCP_QUEUE_SEQ:
2958                if (tp->repair_queue == TCP_SEND_QUEUE)
2959                        val = tp->write_seq;
2960                else if (tp->repair_queue == TCP_RECV_QUEUE)
2961                        val = tp->rcv_nxt;
2962                else
2963                        return -EINVAL;
2964                break;
2965
2966        case TCP_USER_TIMEOUT:
2967                val = jiffies_to_msecs(icsk->icsk_user_timeout);
2968                break;
2969
2970        case TCP_FASTOPEN:
2971                val = icsk->icsk_accept_queue.fastopenq.max_qlen;
2972                break;
2973
2974        case TCP_TIMESTAMP:
2975                val = tcp_time_stamp + tp->tsoffset;
2976                break;
2977        case TCP_NOTSENT_LOWAT:
2978                val = tp->notsent_lowat;
2979                break;
2980        case TCP_SAVE_SYN:
2981                val = tp->save_syn;
2982                break;
2983        case TCP_SAVED_SYN: {
2984                if (get_user(len, optlen))
2985                        return -EFAULT;
2986
2987                lock_sock(sk);
2988                if (tp->saved_syn) {
2989                        if (len < tp->saved_syn[0]) {
2990                                if (put_user(tp->saved_syn[0], optlen)) {
2991                                        release_sock(sk);
2992                                        return -EFAULT;
2993                                }
2994                                release_sock(sk);
2995                                return -EINVAL;
2996                        }
2997                        len = tp->saved_syn[0];
2998                        if (put_user(len, optlen)) {
2999                                release_sock(sk);
3000                                return -EFAULT;
3001                        }
3002                        if (copy_to_user(optval, tp->saved_syn + 1, len)) {
3003                                release_sock(sk);
3004                                return -EFAULT;
3005                        }
3006                        tcp_saved_syn_free(tp);
3007                        release_sock(sk);
3008                } else {
3009                        release_sock(sk);
3010                        len = 0;
3011                        if (put_user(len, optlen))
3012                                return -EFAULT;
3013                }
3014                return 0;
3015        }
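        /* Illustrative userspace sketch: TCP_SAVE_SYN is enabled on the
         * listener, then the saved SYN headers are read once from the
         * accepted socket (a second read returns length 0, because
         * tcp_saved_syn_free() ran above):
         *
         *      int on = 1;
         *      unsigned char syn[512];
         *      socklen_t len = sizeof(syn);
         *
         *      setsockopt(listen_fd, IPPROTO_TCP, TCP_SAVE_SYN, &on, sizeof(on));
         *      ...accept()...
         *      getsockopt(conn_fd, IPPROTO_TCP, TCP_SAVED_SYN, syn, &len);
         */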
3016        default:
3017                return -ENOPROTOOPT;
3018        }
3019
3020        if (put_user(len, optlen))
3021                return -EFAULT;
3022        if (copy_to_user(optval, &val, len))
3023                return -EFAULT;
3024        return 0;
3025}
3026
3027int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
3028                   int __user *optlen)
3029{
3030        struct inet_connection_sock *icsk = inet_csk(sk);
3031
3032        if (level != SOL_TCP)
3033                return icsk->icsk_af_ops->getsockopt(sk, level, optname,
3034                                                     optval, optlen);
3035        return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3036}
3037EXPORT_SYMBOL(tcp_getsockopt);
3038
3039#ifdef CONFIG_COMPAT
3040int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
3041                          char __user *optval, int __user *optlen)
3042{
3043        if (level != SOL_TCP)
3044                return inet_csk_compat_getsockopt(sk, level, optname,
3045                                                  optval, optlen);
3046        return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3047}
3048EXPORT_SYMBOL(compat_tcp_getsockopt);
3049#endif
3050
3051#ifdef CONFIG_TCP_MD5SIG
3052static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
3053static DEFINE_MUTEX(tcp_md5sig_mutex);
3054static bool tcp_md5sig_pool_populated = false;
3055
3056static void __tcp_alloc_md5sig_pool(void)
3057{
3058        struct crypto_ahash *hash;
3059        int cpu;
3060
3061        hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
3062        if (IS_ERR(hash))
3063                return;
3064
3065        for_each_possible_cpu(cpu) {
3066                void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
3067                struct ahash_request *req;
3068
3069                if (!scratch) {
3070                        scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
3071                                               sizeof(struct tcphdr),
3072                                               GFP_KERNEL,
3073                                               cpu_to_node(cpu));
3074                        if (!scratch)
3075                                return;
3076                        per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
3077                }
3078                if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
3079                        continue;
3080
3081                req = ahash_request_alloc(hash, GFP_KERNEL);
3082                if (!req)
3083                        return;
3084
3085                ahash_request_set_callback(req, 0, NULL, NULL);
3086
3087                per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
3088        }
3089        /* before setting tcp_md5sig_pool_populated, we must commit all writes
3090         * to memory. See smp_rmb() in tcp_get_md5sig_pool()
3091         */
3092        smp_wmb();
3093        tcp_md5sig_pool_populated = true;
3094}
3095
3096bool tcp_alloc_md5sig_pool(void)
3097{
3098        if (unlikely(!tcp_md5sig_pool_populated)) {
3099                mutex_lock(&tcp_md5sig_mutex);
3100
3101                if (!tcp_md5sig_pool_populated)
3102                        __tcp_alloc_md5sig_pool();
3103
3104                mutex_unlock(&tcp_md5sig_mutex);
3105        }
3106        return tcp_md5sig_pool_populated;
3107}
3108EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3109
3110
3111/**
3112 *      tcp_get_md5sig_pool - get md5sig_pool for this user
3113 *
3114 *      We use a per-cpu structure, so if we succeed, we exit with preemption
3115 *      and BH disabled, to make sure that another thread or softirq handler
3116 *      won't try to get the same context.
3117 */
3118struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3119{
3120        local_bh_disable();
3121
3122        if (tcp_md5sig_pool_populated) {
3123                /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
3124                smp_rmb();
3125                return this_cpu_ptr(&tcp_md5sig_pool);
3126        }
3127        local_bh_enable();
3128        return NULL;
3129}
3130EXPORT_SYMBOL(tcp_get_md5sig_pool);
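/* Illustrative in-kernel sketch: callers bracket use of the per-cpu pool
 * with tcp_get_md5sig_pool()/tcp_put_md5sig_pool() (the latter re-enables
 * BH; see include/net/tcp.h), feeding the request with the helpers below.
 * Error handling is omitted and "hash"/"key" are assumed locals:
 *
 *      u8 hash[16];
 *      struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
 *
 *      if (hp) {
 *              crypto_ahash_init(hp->md5_req);
 *              ... tcp_md5_hash_skb_data()/tcp_md5_hash_key() ...
 *              ahash_request_set_crypt(hp->md5_req, NULL, hash, 0);
 *              crypto_ahash_final(hp->md5_req);
 *              tcp_put_md5sig_pool();
 *      }
 */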
3131
3132int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3133                          const struct sk_buff *skb, unsigned int header_len)
3134{
3135        struct scatterlist sg;
3136        const struct tcphdr *tp = tcp_hdr(skb);
3137        struct ahash_request *req = hp->md5_req;
3138        unsigned int i;
3139        const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3140                                           skb_headlen(skb) - header_len : 0;
3141        const struct skb_shared_info *shi = skb_shinfo(skb);
3142        struct sk_buff *frag_iter;
3143
3144        sg_init_table(&sg, 1);
3145
3146        sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3147        ahash_request_set_crypt(req, &sg, NULL, head_data_len);
3148        if (crypto_ahash_update(req))
3149                return 1;
3150
3151        for (i = 0; i < shi->nr_frags; ++i) {
3152                const struct skb_frag_struct *f = &shi->frags[i];
3153                unsigned int offset = f->page_offset;
3154                struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
3155
3156                sg_set_page(&sg, page, skb_frag_size(f),
3157                            offset_in_page(offset));
3158                ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
3159                if (crypto_ahash_update(req))
3160                        return 1;
3161        }
3162
3163        skb_walk_frags(skb, frag_iter)
3164                if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3165                        return 1;
3166
3167        return 0;
3168}
3169EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3170
3171int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3172{
3173        struct scatterlist sg;
3174
3175        sg_init_one(&sg, key->key, key->keylen);
3176        ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
3177        return crypto_ahash_update(hp->md5_req);
3178}
3179EXPORT_SYMBOL(tcp_md5_hash_key);
3180
3181#endif
3182
3183void tcp_done(struct sock *sk)
3184{
3185        struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3186
3187        if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3188                TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3189
3190        tcp_set_state(sk, TCP_CLOSE);
3191        tcp_clear_xmit_timers(sk);
3192        if (req)
3193                reqsk_fastopen_remove(sk, req, false);
3194
3195        sk->sk_shutdown = SHUTDOWN_MASK;
3196
3197        if (!sock_flag(sk, SOCK_DEAD))
3198                sk->sk_state_change(sk);
3199        else
3200                inet_csk_destroy_sock(sk);
3201}
3202EXPORT_SYMBOL_GPL(tcp_done);
3203
3204int tcp_abort(struct sock *sk, int err)
3205{
3206        if (!sk_fullsock(sk)) {
3207                if (sk->sk_state == TCP_NEW_SYN_RECV) {
3208                        struct request_sock *req = inet_reqsk(sk);
3209
3210                        local_bh_disable();
3211                        inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
3212                                                          req);
3213                        local_bh_enable();
3214                        return 0;
3215                }
3216                return -EOPNOTSUPP;
3217        }
3218
3219        /* Don't race with userspace socket closes such as tcp_close. */
3220        lock_sock(sk);
3221
3222        if (sk->sk_state == TCP_LISTEN) {
3223                tcp_set_state(sk, TCP_CLOSE);
3224                inet_csk_listen_stop(sk);
3225        }
3226
3227        /* Don't race with BH socket closes such as inet_csk_listen_stop. */
3228        local_bh_disable();
3229        bh_lock_sock(sk);
3230
3231        if (!sock_flag(sk, SOCK_DEAD)) {
3232                sk->sk_err = err;
3233                /* This barrier is coupled with smp_rmb() in tcp_poll() */
3234                smp_wmb();
3235                sk->sk_error_report(sk);
3236                if (tcp_need_reset(sk->sk_state))
3237                        tcp_send_active_reset(sk, GFP_ATOMIC);
3238                tcp_done(sk);
3239        }
3240
3241        bh_unlock_sock(sk);
3242        local_bh_enable();
3243        release_sock(sk);
3244        return 0;
3245}
3246EXPORT_SYMBOL_GPL(tcp_abort);
3247
3248extern struct tcp_congestion_ops tcp_reno;
3249
3250static __initdata unsigned long thash_entries;
3251static int __init set_thash_entries(char *str)
3252{
3253        ssize_t ret;
3254
3255        if (!str)
3256                return 0;
3257
3258        ret = kstrtoul(str, 0, &thash_entries);
3259        if (ret)
3260                return 0;
3261
3262        return 1;
3263}
3264__setup("thash_entries=", set_thash_entries);
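/* Usage note (boot-time configuration): "thash_entries=N" on the kernel
 * command line, e.g. thash_entries=524288 (value purely illustrative),
 * overrides the automatic sizing of the "TCP established" hash table
 * allocated in tcp_init() below.
 */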
3265
3266static void __init tcp_init_mem(void)
3267{
3268        unsigned long limit = nr_free_buffer_pages() / 16;
3269
3270        limit = max(limit, 128UL);
3271        sysctl_tcp_mem[0] = limit / 4 * 3;              /* 4.68 % */
3272        sysctl_tcp_mem[1] = limit;                      /* 6.25 % */
3273        sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;      /* 9.37 % */
3274}
3275
3276void __init tcp_init(void)
3277{
3278        int max_rshare, max_wshare, cnt;
3279        unsigned long limit;
3280        unsigned int i;
3281
3282        BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
3283                     FIELD_SIZEOF(struct sk_buff, cb));
3284
3285        percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3286        percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
3287        tcp_hashinfo.bind_bucket_cachep =
3288                kmem_cache_create("tcp_bind_bucket",
3289                                  sizeof(struct inet_bind_bucket), 0,
3290                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3291
3292        /* Size and allocate the main established and bind bucket
3293         * hash tables.
3294         *
3295         * The methodology is similar to that of the buffer cache.
3296         */
3297        tcp_hashinfo.ehash =
3298                alloc_large_system_hash("TCP established",
3299                                        sizeof(struct inet_ehash_bucket),
3300                                        thash_entries,
3301                                        17, /* one slot per 128 KB of memory */
3302                                        0,
3303                                        NULL,
3304                                        &tcp_hashinfo.ehash_mask,
3305                                        0,
3306                                        thash_entries ? 0 : 512 * 1024);
3307        for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
3308                INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3309
3310        if (inet_ehash_locks_alloc(&tcp_hashinfo))
3311                panic("TCP: failed to alloc ehash_locks");
3312        tcp_hashinfo.bhash =
3313                alloc_large_system_hash("TCP bind",
3314                                        sizeof(struct inet_bind_hashbucket),
3315                                        tcp_hashinfo.ehash_mask + 1,
3316                                        17, /* one slot per 128 KB of memory */
3317                                        0,
3318                                        &tcp_hashinfo.bhash_size,
3319                                        NULL,
3320                                        0,
3321                                        64 * 1024);
3322        tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
3323        for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3324                spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3325                INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3326        }
3327
3328
3329        cnt = tcp_hashinfo.ehash_mask + 1;
3330
3331        tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3332        sysctl_tcp_max_orphans = cnt / 2;
3333        sysctl_max_syn_backlog = max(128, cnt / 256);
3334
3335        tcp_init_mem();
3336        /* Set per-socket limits to no more than 1/128 of the pressure threshold */
3337        limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
3338        max_wshare = min(4UL*1024*1024, limit);
3339        max_rshare = min(6UL*1024*1024, limit);
3340
3341        sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3342        sysctl_tcp_wmem[1] = 16*1024;
3343        sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3344
3345        sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3346        sysctl_tcp_rmem[1] = 87380;
3347        sysctl_tcp_rmem[2] = max(87380, max_rshare);
3348
3349        pr_info("Hash tables configured (established %u bind %u)\n",
3350                tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3351
3352        tcp_metrics_init();
3353        BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3354        tcp_tasklet_init();
3355}
3356